Recherche de tag: kmer


Simple Fast Kmer counter (with small k) [C++]

28.01.2019     Natir      cpp kmer counter 

  A simple and fast kmer counter.

limitation:
- k must always be odd
- k must be choose at compile time
- the memory usage is 2 ^ (k*2-1) bytes
- in this version the maximum count of a kmer is 255 (see comments)
- in this version the maximum value of k is 31 (see comments)
- forward and reverse kmers are counted at the same time


The main function provides a simple example.
#include <bitset>

template <size_t K>
using kmer_t = typename std::conditional<
  K <= 4 && K % 2 == 1,
  std::uint_least8_t,
  typename std::conditional<
    K <= 8 && K % 2 == 1,
    std::uint16_t,
    typename std::conditional<
      K <= 16 && K % 2 == 1,
      std::uint32_t,
      typename std::conditional<
        K <= 32 && K % 2 == 1,
        std::uint64_t,
        std::false_type
      >::type
    >::type
  >::type
>::type;

template <std::uint8_t K>
constexpr std::uint64_t comp_mask() {
  if(K <= 4 && K % 2 == 1) {
    return 0b10101010;
  } else if(K <= 8 && K % 2 == 1) {
    return 0b1010101010101010;
  } else if(K <= 16 && K % 2 == 1) {
    return 0b10101010101010101010101010101010;
  } else if(K <= 32 && K % 2 == 1) {
    return 0b1010101010101010101010101010101010101010101010101010101010101010;
  } else {
    return 0;
  }
}

template <std::uint8_t K>
constexpr std::uint64_t max_k() {
  if(K <= 4) {
    return 4;
  } else if(K <= 8) {
    return 8;
  } else if(K <= 16) {
    return 16;
  } else if(K <= 32) {
    return 32;
  } else {
    return 0;
  }
}	     

template<std::uint8_t K>
kmer_t<K> seq2bit(std::string seq) {
  kmer_t<K> kmer = 0;
  
  for(auto n: seq) {
    kmer <<= 2;
    kmer |= ((n >> 1) & 0b11);
  }
  
  return kmer;
}

template<std::uint8_t K>
kmer_t<K> reverse_2(kmer_t<K> kmer) {
  kmer_t<K> reversed = 0;

  for(kmer_t<K> i = 0; i < K-1; ++i) {
    reversed = (reversed ^ (kmer & 0b11)) << 2;
    kmer >>= 2;
  }
  
  return reversed ^ (kmer & 0b11);
}

template<std::uint8_t K>
uint8_t parity(kmer_t<K> kmer) {
  return std::bitset<K*2>(kmer).count() % 2; 
}
 
template<std::uint8_t K>
kmer_t<K> get_cannonical(kmer_t<K> kmer) {
  uint8_t cleaning_move = (max_k<K>() - K) * 2;

  if(parity<K>(kmer) == 0) {
    return kmer;
  } else {
    
    kmer_t<K> hash = (reverse_2<K>(kmer) ^ comp_mask<K>());
    hash <<= cleaning_move;
    hash >>= cleaning_move;
        
    return hash;
  }
}

template<std::uint8_t K>
kmer_t<K> hash_kmer(kmer_t<K> kmer) {
  return get_cannonical<K>(kmer) >> 1;
}

#include <iostream>
#include <vector>

int main(int argc, char* argv[]) {

  constexpr std::uint8_t k = 3; // must be odd and minus than 32 change test at line 119

  std::string seq = "AAACCCTTTGGG";

  // To increase maximal count change uint8_t, to unint16_t, uint32_t or uint64_t
  std::vector<std::uint8_t> kmer2count(1 << (k * 2 -1), 0); // 1 << n <-> 2^n

  for(size_t i = 0; i != seq.length() - k + 1; i++) {
    std::string sub_seq = seq.substr(i, k);
    
    uint32_t hash = hash_kmer<k>(seq2bit<k>(sub_seq));

    if(kmer2count[hash] != 255) {
      kmer2count[hash]++;
    }
  }
  
  for(size_t i = 0; i != kmer2count.size(); i++) {
    std::cout<<int(kmer2count[i])<<" ";
  }
  std::cout<<std::endl;
  
  return 0;
}
5/5 - [1 rating]