{ Les snippets de bioinfo-fr.net }


Temps d'exécution des examples d'un package [R]

18.06.2020     Kumquatum      R package dev example 

  Connaître le temps que met chaque exemple d'une fonction à s'exécuter permet de déterminer lesquelles il faut passer en "do not run" pour ne pas dépasser le temps de R CMD check
library(magrittr)

# Getting function name
functions <- list.files("man") %>%
  strsplit(".Rd")

# Timing function exection
times <- lapply(functions, function(func){
  time_exe <- suppressWarnings({ # fun
      system.time({ example(func, "GWENA",
                                      local = new.env(),
                                      character.only = TRUE, 
                                      ask = FALSE,
                                      echo = FALSE)})
    })
  return(time_exe)
}) %>%
  rlist::list.rbind() %>%
  set_rownames(unlist(functions)) %>%
  as.data.frame() %>%
  .[order(.$user.self, decreasing = TRUE),]
0/5 - [0 vote]




exemple d'utilisation du module "pyensembl" via un NoteBook Jupyter (Anaconda) [Python]

25.09.2019     erwan06      Anaconda Jupyter pyensembl 

  on va comparer les références de transcrits (identifiants Ensembl grch38) avec le fichier de référence "VariantNGS.tsv" suivant:
Gene name Accession Number
Gene name Accession Number
VHL ENST00000256474
SETD2 ENST00000409792
PBRM1 ENST00000337303
FHIT ENST00000341848
RASSF1 ENST00000357043
KDM5C ENST00000375401
MITF ENST00000394351
BAP1 ENST00000460680
KDM6A ENST00000377967
ABL1 ENST00000318560
AKT1 ENST00000349310
ALK ENST00000389048
APC ENST00000457016
ATM ENST00000278616
BRAF ENST00000288602
CDH1 ENST00000261769
CDKN2A ENST00000304494
CSF1R ENST00000286301
CTNNB1 ENST00000349496
DDR2 ENST00000367922
EGFR ENST00000275493
ERBB2 ENST00000269571
ERBB4 ENST00000342788
EZH2 ENST00000320356
FBXW7 ENST00000281708
FGFR1 ENST00000447712
FGFR2 ENST00000358487
FGFR3 ENST00000440486
FLT3 ENST00000241453
GNA11 ENST00000078429
GNAQ ENST00000286548
GNAS ENST00000371085
HNF1A ENST00000257555
HRAS ENST00000397596
IDH1 ENST00000345146
IDH2 ENST00000330062
JAK2 ENST00000381652
JAK3 ENST00000458235
KDR ENST00000263923
KIT ENST00000288135
KRAS ENST00000311936
MAP2K1 ENST00000307102
MET ENST00000318493
MLH1 ENST00000231790
MPL ENST00000372470
NOTCH1 ENST00000277541
NPM1 ENST00000517671
NRAS ENST00000369535
PDGFRA ENST00000257290
PIK3CA ENST00000263967
PTEN ENST00000371953
PTPN11 ENST00000351677
RB1 ENST00000267163
RET ENST00000355710
SMAD4 ENST00000342988
SMARCB1 ENST00000263121
SMO ENST00000249373
SRC ENST00000445403
STK11 ENST00000326873
TP53 ENST00000269305


#!/usr/bin/env python
# coding: utf-8

# # Installing PyEnsembl
# `PyEnsembl`is a Python interface to Ensembl reference genome metadata such as exons and transcripts. `PyEnsembl` downloads GTF and FASTA files from the Ensembl FTP server and loads them into a local database. PyEnsembl can also work with custom reference data specified using user-supplied GTF and FASTA files.

# In[52]:

import pyensembl


# In[53]:

from pyensembl import EnsemblRelease


# In[54]:

data = EnsemblRelease(78)


# In[55]:

import os, sys


# In[56]:

os.path.isfile("TruSeq.txt")


# In[57]:

f = open("TruSeq.txt",'r',encoding='utf8')


# In[58]:

L = f.readlines()


# In[59]:

f.close()


# In[60]:

genes = [str(L[i]).rstrip("\n") for i in range(1,len(L))]


# In[61]:

liste = [str(genes[i]).split('\t') for i in range(len(genes))]


# In[62]:

liste_simple = []
for elt in range(0,len(liste)):
    liste_simple = liste_simple + liste[elt]


# In[63]:

print(liste_simple)


# ## get all Genes ID from TrueSeq Amplicon cancer panel

# In[64]:

f = open("TruSeq_geneIDs.txt",'w',encoding = 'utf8')
tTruSeq = []

for i in range(0,len(liste_simple)):
    gene_courant = str(liste_simple[i]).rstrip()
    gene_ID = pyensembl.ensembl_grch38.genes_by_name(gene_courant)[0].gene_id
    f.write("gene: {} , gene_ID : {} \n".format(str(liste_simple[i]),gene_ID))
    tTruSeq = [gene_ID, pyensembl.ensembl_grch38.genes_by_name(gene_courant)[0].transcripts] + tTruSeq
f.close()
    

# In[65]:

f = open("genotype_transcripts.txt",'w',encoding = 'utf8')
f.write(str(tTruSeq))
f.close()


# ## compare genotype Transcript IDs vs NGS phenotypic transcripts (VariantsNGS.tsv)

# In[66]:

import os, sys


# In[67]:

import pyensembl


# In[68]:

os.path.isfile("VariantsNGS.tsv")


# In[69]:

f = open("VariantsNGS.tsv",'r',encoding='utf8')


# In[70]:

L2 = f.readlines() 


# In[71]:

f.close()


# In[72]:

genes = [str(L2[i]).rstrip("\n") for i in range(1,len(L2))]


# In[73]:

liste2 = [str(genes[i]).split('\t') for i in range(len(genes))]


# In[74]:

liste_simple2 = []
for elt in range(0,len(liste2)):
    liste_simple2 = liste_simple2 + liste2[elt]

# In[75]:

print(liste_simple2)

# ## manual cross-check of each transcript IDs

# In[76]:

VHL = pyensembl.ensembl_grch38.genes_by_name("VHL")


# In[77]:

tVHL = VHL[0].transcripts


# In[78]:

print(tVHL[0].transcript_id)


# same Transcript IDs found

# In[79]:

SETD2 = pyensembl.ensembl_grch38.genes_by_name("SETD2")
tSETD2 = SETD2[0].transcripts
print(tSETD2[0].transcript_id)


# different transcript IDs found : ENST00000431180 != ENST00000409792

# l'automatisation de la comparaison des deux listes simples n'est pas encore terminee




0/5 - [0 vote]




Couleur moyenne d'un vecteur de couleur en RGB + alpha [R]

30.07.2019     gdevailly      R RGB HTML couleurs 

  Petite fonction retournant la couleur moyenne d'un vecteur de couleur en espace RGB (rouge, vert, bleu) + canal alpha (transparence).
Il y a des espaces colorimétriques sans plus adapté à ce genre d'opérations.
mean_color <- function(mycolors) {
    R     <- strtoi(x = substr(mycolors,2,3), base = 16)
    G     <- strtoi(x = substr(mycolors,4,5), base = 16)
    B     <- strtoi(x = substr(mycolors,6,7), base = 16)
    alpha <- strtoi(x = substr(mycolors,8,9), base = 16)

    return(
        rgb(
            red   = round(mean(R)),
            green = round(mean(G)),
            blue  = round(mean(B)),
            alpha = round(mean(alpha)),
            maxColorValue = 255
        )
    )
}

mean_color(c("#000000FF", "#FFFFFFFF"))
mean_color(c("#FF0000FF", "#00FF00FF", "#FFFF00FF", "#FF0000FF"))
mean_color(rainbow(8))
0/5 - [0 vote]




doi2bib [Bash]

02.07.2019     Natir      doi bibtex 

  A bash function to get bibtex from doi
doi2bib ()
{
    curl -H "Accept: application/x-bibtex; charset=utf-8" https://data.crossref.org/${1}
}

# call doi2bib and write result in clipboard
doi2clip ()
{
    doi2bib ${1} | xclip -selection c
}

## Usage 
## > doi2bib 10.1093/bioinformatics/btz219 
## @article{Marijon_2019,
##     doi = {10.1093/bioinformatics/btz219},
##     url = {https://doi.org/10.1093%2Fbioinformatics%2Fbtz219},
##     year = 2019,
##     month = {mar},
##     publisher = {Oxford University Press ({OUP})},
##     author = {Pierre Marijon and Rayan Chikhi and Jean-St{\'{e}}phane Varr{\'{e}}},
##     editor = {John Hancock},
##     title = {Graph analysis of fragmented long-read bacterial genome assemblies},
##     journal = {Bioinformatics}
## }
4.75/5 - [3 votes]




recherche d une sequence mystere a l aide de biopython [Python]

04.04.2019     erwan06      biopython fasta arabette photosynthese 

  ce snippet reprend le code de recherche de la protéine codée vraisemblable comme cela est décrit sur le site http://arn16s.ovh (étape n°3) à partir de la séquence fasta "U91966". La protéine codée par l'une des six phases est alors affichée (Rubisco)
import Bio
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna

#on importe la sequence mystere avec les outils en ligne

def seq_codante_de_phase(phase):
    meilleure_seq=compteur=i=j=start=stop=0
    longueur = len(phase)-1
    for compteur in range (i,longueur):
        if (phase[i] != 'M'):
            i += 1
        else:
            start = i
            j = start
            while (phase[j] != '*' and j < len(phase)-1):
                j += 1
            if (phase[j] == "*"):
                stop = j
            seq_codante = phase[i:j]
            if (len(seq_codante) > meilleure_seq):
                meilleure_seq = len(seq_codante)
                meilleur_start = start
                meilleur_stop = stop
                proteine_de_phase = phase[meilleur_start+1:meilleur_stop]
            i += 1
    return(proteine_de_phase)        

#main

mysterious_sequence = SeqIO.read(open('my_sequence.fasta'),
'fasta',
alphabet=generic_dna).seq


phase_1 = mysterious_sequence[0::]
phase_2 = mysterious_sequence[1::]
phase_3 = mysterious_sequence[2::]

#on renverse la sequence mystere pour creer les 3 dernieres phases 
complement_sequence = mysterious_sequence.complement()
reverse_sequence = complement_sequence[::-1]

#sequence à partir du dernier nucleotide en sens contraire
phase_4 = reverse_sequence[0::]
#sequence à partir de l'avant-dernier nucleotide en sens contraire
phase_5 = reverse_sequence[1::]
#sequence à partir de l'antépénultième nucleotide en sens contraire 
phase_6 = reverse_sequence[2::]

prot_1 = str(phase_1.translate())
prot_2 = str(phase_2.translate())
prot_3 = str(phase_3.translate())
prot_4 = str(phase_4.translate())
prot_5 = str(phase_5.translate())
prot_6 = str(phase_6.translate())

liste=[seq_codante_de_phase(prot_1),seq_codante_de_phase(prot_2),seq_codante_de_phase(prot_3),seq_codante_de_phase(prot_3),seq_codante_de_phase(prot_4),seq_codante_de_phase(prot_5),seq_codante_de_phase(prot_6)]
sorted(liste, key=len)
#la phase vraisemblable est la plus longue, donc la première de la liste triee par longueur
print (liste[1])
0/5 - [0 vote]




Average timeit [Python]

26.02.2019     Yo_O      timer python chronophage temps benchmark 

  Une fonction timeit permettant de placer @timeit en decorateur d'une methode à chronométrer.
Petit bonus : joue n fois la méthode décorée et rend le temps moyen.

source originale : https://github.com/realpython/materials/blob/master/pandas-fast-flexible-intuitive/tutorial/timer.py
import functools
import gc
import itertools
import sys
from timeit import default_timer as _timer


def timeit(_func=None, *, repeat=3, number=1000, file=sys.stdout):
    """Decorator: prints time from best of `repeat` trials.
    Mimics `timeit.repeat()`, but avg. time is printed.
    Returns function result and prints time.
    You can decorate with or without parentheses, as in
    Python's @dataclass class decorator.
    kwargs are passed to `print()`.
    >>> @timeit
    ... def f():
    ...     return "-".join(str(n) for n in range(100))
    ...
    >>> @timeit(number=100000)
    ... def g():
    ...     return "-".join(str(n) for n in range(10))
    ...
    """

    _repeat = functools.partial(itertools.repeat, None)

    def wrap(func):
        @functools.wraps(func)
        def _timeit(*args, **kwargs):
            # Temporarily turn off garbage collection during the timing.
            # Makes independent timings more comparable.
            # If it was originally enabled, switch it back on afterwards.
            gcold = gc.isenabled()
            gc.disable()

            try:
                # Outer loop - the number of repeats.
                trials = []
                for _ in _repeat(repeat):
                    # Inner loop - the number of calls within each repeat.
                    total = 0
                    for _ in _repeat(number):
                        start = _timer()
                        result = func(*args, **kwargs)
                        end = _timer()
                        total += end - start
                    trials.append(total)

                # We want the *average time* from the *best* trial.
                # For more on this methodology, see the docs for
                # Python's `timeit` module.
                #
                # "In a typical case, the lowest value gives a lower bound
                # for how fast your machine can run the given code snippet;
                # higher values in the result vector are typically not
                # caused by variability in Python’s speed, but by other
                # processes interfering with your timing accuracy."
                best = min(trials) / number
                print(
                    "Best of {} trials with {} function"
                    " calls per trial:".format(repeat, number)
                )
                print(
                    "Function `{}` ran in average"
                    " of {:0.3f} seconds.".format(func.__name__, best),
                    end="\n\n",
                    file=file,
                )
            finally:
                if gcold:
                    gc.enable()
            # Result is returned *only once*
            return result

        return _timeit

    # Syntax trick from Python @dataclass
    if _func is None:
        return wrap
    else:
        return wrap(_func)
5/5 - [1 vote]




Simple Fast Kmer counter (with small k) [C++]

28.01.2019     Natir      cpp kmer counter 

  A simple and fast kmer counter.

limitation:
- k must always be odd
- k must be choose at compile time
- the memory usage is 2 ^ (k*2-1) bytes
- in this version the maximum count of a kmer is 255 (see comments)
- in this version the maximum value of k is 31 (see comments)
- forward and reverse kmers are counted at the same time


The main function provides a simple example.
#include <bitset>

template <size_t K>
using kmer_t = typename std::conditional<
  K <= 4 && K % 2 == 1,
  std::uint_least8_t,
  typename std::conditional<
    K <= 8 && K % 2 == 1,
    std::uint16_t,
    typename std::conditional<
      K <= 16 && K % 2 == 1,
      std::uint32_t,
      typename std::conditional<
        K <= 32 && K % 2 == 1,
        std::uint64_t,
        std::false_type
      >::type
    >::type
  >::type
>::type;

template <std::uint8_t K>
constexpr std::uint64_t comp_mask() {
  if(K <= 4 && K % 2 == 1) {
    return 0b10101010;
  } else if(K <= 8 && K % 2 == 1) {
    return 0b1010101010101010;
  } else if(K <= 16 && K % 2 == 1) {
    return 0b10101010101010101010101010101010;
  } else if(K <= 32 && K % 2 == 1) {
    return 0b1010101010101010101010101010101010101010101010101010101010101010;
  } else {
    return 0;
  }
}

template <std::uint8_t K>
constexpr std::uint64_t max_k() {
  if(K <= 4) {
    return 4;
  } else if(K <= 8) {
    return 8;
  } else if(K <= 16) {
    return 16;
  } else if(K <= 32) {
    return 32;
  } else {
    return 0;
  }
}	     

template<std::uint8_t K>
kmer_t<K> seq2bit(std::string seq) {
  kmer_t<K> kmer = 0;
  
  for(auto n: seq) {
    kmer <<= 2;
    kmer |= ((n >> 1) & 0b11);
  }
  
  return kmer;
}

template<std::uint8_t K>
kmer_t<K> reverse_2(kmer_t<K> kmer) {
  kmer_t<K> reversed = 0;

  for(kmer_t<K> i = 0; i < K-1; ++i) {
    reversed = (reversed ^ (kmer & 0b11)) << 2;
    kmer >>= 2;
  }
  
  return reversed ^ (kmer & 0b11);
}

template<std::uint8_t K>
uint8_t parity(kmer_t<K> kmer) {
  return std::bitset<K*2>(kmer).count() % 2; 
}
 
template<std::uint8_t K>
kmer_t<K> get_cannonical(kmer_t<K> kmer) {
  uint8_t cleaning_move = (max_k<K>() - K) * 2;

  if(parity<K>(kmer) == 0) {
    return kmer;
  } else {
    
    kmer_t<K> hash = (reverse_2<K>(kmer) ^ comp_mask<K>());
    hash <<= cleaning_move;
    hash >>= cleaning_move;
        
    return hash;
  }
}

template<std::uint8_t K>
kmer_t<K> hash_kmer(kmer_t<K> kmer) {
  return get_cannonical<K>(kmer) >> 1;
}

#include <iostream>
#include <vector>

int main(int argc, char* argv[]) {

  constexpr std::uint8_t k = 3; // must be odd and minus than 32 change test at line 119

  std::string seq = "AAACCCTTTGGG";

  // To increase maximal count change uint8_t, to unint16_t, uint32_t or uint64_t
  std::vector<std::uint8_t> kmer2count(1 << (k * 2 -1), 0); // 1 << n <-> 2^n

  for(size_t i = 0; i != seq.length() - k + 1; i++) {
    std::string sub_seq = seq.substr(i, k);
    
    uint32_t hash = hash_kmer<k>(seq2bit<k>(sub_seq));

    if(kmer2count[hash] != 255) {
      kmer2count[hash]++;
    }
  }
  
  for(size_t i = 0; i != kmer2count.size(); i++) {
    std::cout<<int(kmer2count[i])<<" ";
  }
  std::cout<<std::endl;
  
  return 0;
}
5/5 - [1 vote]




Conversion d'encodage pour fichier tabulé [Bash]

12.10.2018     Yo_O      utf8 encodage csv excel tableau données cli 

  A l'aide de iconv (https://en.wikipedia.org/wiki/Iconv), vous pouvez maintenant arranger n'importe quel fichier tabulé "mal encodé" (comprendre "pas encodé en UTF-8") venant de vos biologistes préférés en une petite ligne.
iconv -f iso-8859-1 -t utf-8 input_file_in_ISO8859-1.csv > output_file_in_UTF8.csv
5/5 - [1 vote]




From FastQ to Fasta format [Bash]

04.10.2018     mathildefog      bash fastq fasta 

  Transforme un ficher fastq en fichier fasta.
cat myFile.fastq | paste - - - - | sed 's/^@/>/g'| cut -f1-2 | tr '\t' '\n' > myFile.fasta
5/5 - [2 votes]




louvain [Python]

24.09.2018     Mathurin      louvain 

  version de louvain trouvé en python
import numpy as np
import os


def genlouvain(B, seed=None):
    '''
    The optimal community structure is a subdivision of the network into
    nonoverlapping groups of nodes which maximizes the number of within-group
    edges and minimizes the number of between-group edges.
    This function is a fast an accurate multi-iterative generalization of the
    louvain community detection algorithm. This function subsumes and improves
    upon modularity_[louvain,finetune]_[und,dir]() and additionally allows to
    optimize other objective functions (includes built-in Potts Model i
    Hamiltonian, allows for custom objective-function matrices).
    Parameters
    ----------
    B : str | NxN np.arraylike
        string describing objective function type, or provides a custom
        objective-function matrix. builtin values 'modularity' uses Q-metric
        as objective function, or 'potts' uses Potts model Hamiltonian.
        Default value 'modularity'.
    seed : int | None
        random seed. default value=None. if None, seeds from /dev/urandom.
    Returns
    -------
    ci : Nx1 np.array
        final community structure
    q : float
        optimized q-statistic (modularity only)
    '''
    np.random.seed(seed)
    st0 = np.random.get_state()

    n = len(B)
    ci = np.arange(n) + 1
    Mb = ci.copy()

    B = np.squeeze(np.asarray(B))
    B = (B + B.T)/2.0
    Hnm = np.zeros((n, n))
    for m in range(1, n + 1):
        Hnm[:, m - 1] = np.sum(B[:, ci == m], axis=1)  # node to module degree
    H = np.sum(Hnm, axis=1)  # node degree
    Hm = np.sum(Hnm, axis=0)  # module degree

    q0 = -np.inf
    # compute modularity
    q = np.sum(B[np.tile(ci, (n, 1)) == np.tile(ci, (n, 1)).T])

    first_iteration = True

    while q - q0 > 1e-10:
        it = 0
        flag = True
        while flag:
            it += 1
            if it > 1000:
                raise ValueError('Modularity infinite loop style G. '
                                    'Please contact the developer.')
            flag = False
            for u in np.random.permutation(n):
                ma = Mb[u] - 1
                dQ = Hnm[u, :] - Hnm[u, ma] + B[u, u]  # algorithm condition
                dQ[ma] = 0

                max_dq = np.max(dQ)
                if max_dq > 1e-10:
                    flag = True
                    mb = np.argmax(dQ)
                    Hnm[:, mb] += B[:, u]
                    Hnm[:, ma] -= B[:, u]  # change node-to-module strengths

                    Hm[mb] += H[u]
                    Hm[ma] -= H[u]  # change module strengths

                    Mb[u] = mb + 1

        _, Mb = np.unique(Mb, return_inverse=True)
        Mb += 1

        M0 = ci.copy()
        if first_iteration:
            ci = Mb.copy()
            first_iteration = False
        else:
            for u in range(1, n + 1):
                ci[M0 == u] = Mb[u - 1]  # assign new modules

        n = np.max(Mb)
        b1 = np.zeros((n, n))
        for i in range(1, n + 1):
            for j in range(i, n + 1):
                # pool weights of nodes in same module
                bm = np.sum(B[np.ix_(Mb == i, Mb == j)])
                b1[i - 1, j - 1] = bm
                b1[j - 1, i - 1] = bm
        B = b1.copy()

        Mb = np.arange(1, n + 1)
        Hnm = B.copy()
        H = np.sum(B, axis=0)
        Hm = H.copy()

        q0 = q
        q = np.trace(B) # compute modularity
        dir = 'output'
        if not os.path.isdir(dir): os.makedirs(dir)
        
        output = open (os.path.join(dir, 'gen_louvain_seed.txt'),'a+')
        print >> output, "%i" % (st0[1][0])


    return ci, q
0/5 - [0 vote]





Ancients snippets >>>
Powered by canSnippet CE - by ademcan