Recherche de tag: python


Parseur de csv basique [Python]

04.07.2018     qcavaille      python3 parser csv 

  Parseur de csv basique :
- Délimiteur réglable
- Pas de limite de colonnes, mais nécessite des lignes "complètes" ET une ligne d'en-têtes de colonnes
- Retourne un générateur, théoriquement plus léger en mémoire. Mais itérable une seule fois !
import csv as L
def readCSV(file, delimiter=";"):
    """Lazily parse a delimited text file whose first row holds column names.

    Args:
        file: Path of the CSV file to read.
        delimiter: Single-character field separator. Defaults to ";".

    Yields:
        One dict per data row, mapping each header name to the field found
        at the same position. Fields beyond the header width are ignored.

    Raises:
        IndexError: If a data row has fewer fields than the header row.

    This is a generator: the file is only opened once iteration starts, and
    the rows can be consumed a single time.
    """
    # newline="" lets the csv module handle line endings itself, which is
    # what its documentation requires for correct quoted-newline handling.
    with open(file, newline="") as csvfile:
        reader = L.reader(csvfile, delimiter=delimiter)
        column_names = next(reader, None)
        if column_names is None:
            # Empty file: no header, hence no rows to yield.
            return
        for row in reader:
            if len(row) < len(column_names):
                raise IndexError(
                    "line {}: expected {} fields, got {}".format(
                        reader.line_num, len(column_names), len(row)
                    )
                )
            # zip() stops at the header width, so surplus fields are dropped.
            yield dict(zip(column_names, row))
2/5 - [3 ratings]




Parseur de fichier gff3 basique [Python]

08.07.2018     propan2one      gff3 parser python3 

  La fonction gff3parser() permet de parser un fichier au format .gff3 et de retourner un dictionnaire au format :

{ numéro de l'ORF : [start, stop, strand ], ... }

Les print en commentaire permettent d'afficher le contenu du gff3 sur le terminal

import re

# The nine tab-separated columns of a GFF3 feature line, in order.
contentGFF = ["seqid", "source", "type", "start", "end", "score", "strand", "phase", "attributes"]

# Matches an attribute entry that names an ORF, e.g. "product=ORF12".
_ORF_PRODUCT = re.compile(r"^product=ORF\d{0,3}")


def gff3parser (filename, dictOrf=None):
    """Parse a GFF3 file and collect the coordinates of its ORF features.

    Args:
        filename: Path of the .gff3 file to read.
        dictOrf: Optional dictionary to fill in place. A new one is created
            when omitted.

    Returns:
        The filled dictionary, shaped as
        ``{ORF number: [start, end, strand], ...}``. Keys and values are the
        raw strings read from the file; the key is whatever follows the
        ``product=ORF`` prefix in the attributes column.

    Lines starting with "#" and lines that do not have exactly nine
    tab-separated columns are skipped.
    """
    if dictOrf is None:
        dictOrf = {}
    with open(filename) as f:
        for line in f:
            if line.startswith("#"):
                continue
            content = line.strip().split("\t")
            if len(content) != len(contentGFF):
                continue
            value = [content[3], content[4], content[6]]
            #print(value)
            #print(content[8])
            numORF = content[8].strip().split(";")
            #print(numORF)
            for product in numORF:
                if _ORF_PRODUCT.search(product):
                    # Keep only what follows the "product=ORF" prefix.
                    keys = product[len("product=ORF"):]
                    dictOrf[keys] = value
    return dictOrf

# Usage:
orf = {}
gff3parser(gffname,orf) # gffname is the name of the GFF3 file to parse
1/5 - [1 rating]




Python cProfile as a decorator [Python]

12.07.2018     Plopp      python cprofile profiling profiler decorator 

  Quickly profile your functions with cProfile by using it as a decorator
import cProfile

def profileit(func):
    """Decorator that profiles every call to *func* with cProfile.

    Each call prints a "Profiling <name>" banner, runs *func* under a fresh
    ``cProfile.Profile``, then prints the collected statistics sorted by
    internal time. The statistics are printed even when *func* raises.

    Args:
        func: The callable to profile.

    Returns:
        A wrapper that forwards all arguments to *func*, returns its result
        and propagates its exceptions unchanged.
    """
    # Function-scope import keeps this snippet self-contained.
    import functools

    # wraps() preserves func's __name__, __doc__, etc. on the wrapper.
    @functools.wraps(func)
    def profiled_func(*args, **kwargs):
        profile = cProfile.Profile()
        print("Profiling " + "\033[95m" + func.__name__ + "\033[0m") # colors
        profile.enable()
        try:
            return func(*args, **kwargs)
        finally:
            # Stop collecting before reporting, whether or not func raised.
            profile.disable()
            profile.print_stats(sort="time")
    return profiled_func



##########
# Example #
##########

@profileit
def times2():
    """Example workload: double each of 500000 integers, then add one."""
    for value in range(500000):
        doubled = value * 2
        doubled = add_one(doubled)

def add_one(n):
    """Return *n* incremented by one."""
    incremented = n + 1
    return incremented

# Run the example; the profileit wrapper prints the cProfile report for times2.
times2()
5/5 - [1 rating]




Average timeit [Python]

26.02.2019     Yo_O      timer python chronophage temps benchmark 

  Une fonction timeit permettant de placer @timeit en décorateur d'une méthode à chronométrer.
Petit bonus : joue n fois la méthode décorée et rend le temps moyen.

source originale : https://github.com/realpython/materials/blob/master/pandas-fast-flexible-intuitive/tutorial/timer.py
import functools
import gc
import itertools
import sys
from timeit import default_timer as _timer


def timeit(_func=None, *, repeat=3, number=1000, file=sys.stdout):
    """Decorator: print the average call time from the best of `repeat` trials.

    Mimics `timeit.repeat()`, but the average time per call is printed.
    The decorated function is called ``repeat * number`` times on every
    invocation; the result of the last call is returned.

    You can decorate with or without parentheses, as in
    Python's @dataclass class decorator.

    Args:
        _func: The function to decorate when used without parentheses.
        repeat: Number of trials; the fastest one is reported.
        number: Number of calls made within each trial.
        file: File-like object that receives both report lines.

    Raises:
        ValueError: If `repeat` or `number` is smaller than 1.

    >>> @timeit
    ... def f():
    ...     return "-".join(str(n) for n in range(100))
    ...
    >>> @timeit(number=100000)
    ... def g():
    ...     return "-".join(str(n) for n in range(10))
    ...
    """
    # With zero trials or zero calls there is nothing to time and no result
    # to return, so reject those values up front.
    if repeat < 1 or number < 1:
        raise ValueError("`repeat` and `number` must both be at least 1")

    def wrap(func):
        @functools.wraps(func)
        def _timeit(*args, **kwargs):
            # Temporarily turn off garbage collection during the timing.
            # Makes independent timings more comparable.
            # If it was originally enabled, switch it back on afterwards.
            gcold = gc.isenabled()
            gc.disable()

            try:
                # Outer loop - the number of repeats.
                trials = []
                for _ in range(repeat):
                    # Inner loop - the number of calls within each repeat.
                    total = 0
                    for _ in range(number):
                        start = _timer()
                        result = func(*args, **kwargs)
                        end = _timer()
                        total += end - start
                    trials.append(total)

                # We want the *average time* from the *best* trial.
                # For more on this methodology, see the docs for
                # Python's `timeit` module.
                #
                # "In a typical case, the lowest value gives a lower bound
                # for how fast your machine can run the given code snippet;
                # higher values in the result vector are typically not
                # caused by variability in Python’s speed, but by other
                # processes interfering with your timing accuracy."
                best = min(trials) / number
                # Both report lines go to `file` so the output stays together.
                print(
                    "Best of {} trials with {} function"
                    " calls per trial:".format(repeat, number),
                    file=file,
                )
                print(
                    "Function `{}` ran in average"
                    " of {:0.3f} seconds.".format(func.__name__, best),
                    end="\n\n",
                    file=file,
                )
            finally:
                if gcold:
                    gc.enable()
            # Result is returned *only once*
            return result

        return _timeit

    # Syntax trick from Python @dataclass
    if _func is None:
        return wrap
    return wrap(_func)
5/5 - [1 rating]




Recherche d'une séquence mystère à l'aide de Biopython [Python]

04.04.2019     erwan06      biopython fasta arabette photosynthese 

  ce snippet reprend le code de recherche de la protéine codée vraisemblable comme cela est décrit sur le site http://arn16s.ovh (étape n°3) à partir de la séquence fasta "U91966". La protéine codée par l'une des six phases est alors affichée (Rubisco)
import Bio
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna

#on importe la sequence mystere avec les outils en ligne

def seq_codante_de_phase(phase):
    """Return the longest candidate protein found in one translated frame.

    A candidate starts at an 'M' and runs up to the next '*' (stop). When no
    stop follows, the candidate runs to the end of *phase*. Candidates are
    compared by length; on a tie the first one wins.

    Args:
        phase: Translated reading frame as a string of one-letter amino-acid
            codes, with '*' marking stop codons.

    Returns:
        The residues of the longest candidate, without its leading 'M' and
        without the stop symbol. An empty string when *phase* has no 'M'.
    """
    # NOTE(review): the leading 'M' is left out of the returned protein, as
    # in the original slicing (start + 1) - confirm this is intended.
    meilleure_longueur = 0
    proteine_de_phase = ""
    start = phase.find("M")
    while start != -1:
        stop = phase.find("*", start)
        if stop == -1:
            # No stop codon after this start: the candidate is open-ended.
            stop = len(phase)
        if stop - start > meilleure_longueur:
            meilleure_longueur = stop - start
            proteine_de_phase = phase[start + 1:stop]
        # Any other 'M' before this stop gives a shorter candidate ending at
        # the same stop, so resume the search after the stop.
        start = phase.find("M", stop + 1)
    return proteine_de_phase

#main

# Passing the path lets SeqIO open and close the file itself, so no handle
# is left open.
# NOTE(review): Bio.Alphabet and the `alphabet` argument were removed in
# Biopython 1.78; this script targets earlier releases - confirm the
# installed version.
mysterious_sequence = SeqIO.read('my_sequence.fasta',
'fasta',
alphabet=generic_dna).seq


# The three forward reading frames.
phase_1 = mysterious_sequence[0::]
phase_2 = mysterious_sequence[1::]
phase_3 = mysterious_sequence[2::]

# Reverse-complement the mystery sequence to build the last three frames.
complement_sequence = mysterious_sequence.complement()
reverse_sequence = complement_sequence[::-1]

# Frame starting at the last nucleotide, read backwards.
phase_4 = reverse_sequence[0::]
# Frame starting at the second-to-last nucleotide, read backwards.
phase_5 = reverse_sequence[1::]
# Frame starting at the third-to-last nucleotide, read backwards.
phase_6 = reverse_sequence[2::]

prot_1 = str(phase_1.translate())
prot_2 = str(phase_2.translate())
prot_3 = str(phase_3.translate())
prot_4 = str(phase_4.translate())
prot_5 = str(phase_5.translate())
prot_6 = str(phase_6.translate())

# Exactly one candidate protein per reading frame.
liste = [
    seq_codante_de_phase(prot)
    for prot in (prot_1, prot_2, prot_3, prot_4, prot_5, prot_6)
]
# Sort in place, longest first: the most likely frame is the longest one,
# so it ends up first in the list.
liste.sort(key=len, reverse=True)
print (liste[0])
0/5 - [0 rating]