Recherche de tag: python
Parseur de csv basique [Python]

- Délimiteur réglable
- Pas de limite de colonnes, mais nécessite des lignes "complètes" ET une ligne d'en-têtes de colonnes
- Retourne un générateur, théoriquement plus léger en mémoire. Mais itérable une seule fois !
import csv as L
def readCSV(file, delimiter=";"):
    """Lazily parse a delimited text file into one dict per data row.

    The first row of the file is taken as the column headers. Every
    following row is yielded as a ``{header: cell}`` mapping; cells are kept
    as the strings produced by the ``csv`` reader. Because this is a
    generator, the result can be iterated only once and the file stays open
    until iteration finishes.

    Args:
        file: Path of the file to read.
        delimiter: Single-character field separator. Defaults to ``";"``.

    Yields:
        dict: One mapping per data row. Cells beyond the number of headers
        are ignored and completely blank lines are skipped.

    Raises:
        IndexError: If a non-blank row has fewer cells than the header row.
    """
    # newline="" lets the csv module do its own line-ending handling.
    with open(file, newline="") as csvfile:
        rows = L.reader(csvfile, delimiter=delimiter)
        header = next(rows, None)
        if header is None:
            # Empty file: nothing to yield.
            return
        for row in rows:
            if not row:
                # The csv reader returns [] for a blank line.
                continue
            if len(row) < len(header):
                raise IndexError(
                    "row has {} cells but the header declares {} columns".format(
                        len(row), len(header)
                    )
                )
            # zip stops at the shorter sequence, so extra cells are dropped.
            yield dict(zip(header, row))
2/5 - [3 ratings]
Parseur de fichier gff3 basique [Python]

Remplit un dictionnaire de la forme { numéro de l'ORF : [start, stop, strand], ... }
Les print en commentaire permettent d'afficher le contenu du gff3 sur le terminal
# Names of the nine tab-separated columns of a GFF3 record, in file order.
contentGFF = ["seqid", "source", "type", "start", "end", "score", "strand", "phase", "attributes"]


def gff3parser(filename, dictOrf):
    """Collect ORF coordinates from a GFF3 file into ``dictOrf``.

    Lines starting with ``#`` and lines that do not split into exactly
    ``len(contentGFF)`` tab-separated fields are skipped. For every remaining
    record, each ``;``-separated attribute of the form ``product=ORF<digits>``
    adds an entry ``dictOrf[<digits>] = [start, end, strand]``. All three
    values are kept as the strings read from the file.

    Args:
        filename: Path of the GFF3 file to read.
        dictOrf: Dictionary that is filled in place.

    Returns:
        dict: The same ``dictOrf`` object, for convenience.
    """
    # Imported here so the function does not depend on a module-level import.
    import re

    # Capture the ORF number itself instead of slicing at a fixed offset, so
    # attributes such as "product=ORF" (no number) never produce an empty key.
    orf_product = re.compile(r"^product=ORF(\d+)")

    with open(filename) as f:
        for line in f:
            if line.startswith("#"):
                continue
            content = line.strip().split("\t")
            if len(content) != len(contentGFF):
                continue
            # Columns 3, 4 and 6 are "start", "end" and "strand".
            value = [content[3], content[4], content[6]]
            for attribute in content[8].strip().split(";"):
                match = orf_product.match(attribute)
                if match:
                    dictOrf[match.group(1)] = value
    return dictOrf
# Usage:
# The dictionary passed as second argument is filled in place by gff3parser.
orf = {}
gff3parser(gffname,orf) # with gffname being the name of the gff3 file to parse
1/5 - [1 rating]
Python cProfile as a decorator [Python]

import cProfile
def profileit(func):
    """Decorator that runs ``func`` under cProfile on every call.

    Each call prints a "Profiling <name>" banner, executes ``func`` with a
    fresh ``cProfile.Profile`` and then prints the collected statistics
    sorted by internal time. The statistics are printed even when ``func``
    raises; the exception then propagates to the caller.

    Args:
        func: The callable to profile.

    Returns:
        A wrapper with the same signature, name and docstring as ``func``
        that returns whatever ``func`` returns.
    """
    # Imported locally so the decorator works wherever it is pasted.
    import functools

    @functools.wraps(func)  # keep func.__name__ / __doc__ on the wrapper
    def profiled_func(*args, **kwargs):
        profile = cProfile.Profile()
        print("Profiling " + "\033[95m" + func.__name__ + "\033[0m")  # colors
        profile.enable()
        try:
            return func(*args, **kwargs)
        finally:
            # Stop collecting before reporting, on success and on error.
            profile.disable()
            profile.print_stats(sort="time")
    return profiled_func
##########
# Example #
##########
@profileit
def times2():
    """Loop 500000 times, doubling the loop index and passing it to add_one."""
    for i in range(500000):
        j = i * 2
        # add_one is looked up at call time, so it may be defined below.
        j = add_one(j)
def add_one(n):
    """Return n + 1."""
    return n+1
# Calling the decorated function runs it and prints the profile report.
times2()
5/5 - [1 rating]
Average timeit [Python]

Petit bonus : joue n fois la méthode décorée et rend le temps moyen.
source originale : https://github.com/realpython/materials/blob/master/pandas-fast-flexible-intuitive/tutorial/timer.py
import functools
import gc
import itertools
import sys
from timeit import default_timer as _timer
def timeit(_func=None, *, repeat=3, number=1000, file=sys.stdout):
    """Decorator: prints the average call time from the best of `repeat` trials.

    Mimics `timeit.repeat()`, but avg. time is printed.
    Each call of the decorated function runs the wrapped function
    `repeat * number` times, prints the report and returns the result of
    the last run.

    You can decorate with or without parentheses, as in
    Python's @dataclass class decorator.

    Args:
        _func: The function to decorate when used without parentheses.
        repeat: Number of trials; the fastest one is reported.
        number: Number of calls per trial.
        file: Stream both report lines are written to (passed to `print()`).

    Raises:
        ValueError: If `repeat` or `number` is smaller than 1.

    >>> @timeit
    ... def f():
    ...     return "-".join(str(n) for n in range(100))
    ...
    >>> @timeit(number=100000)
    ... def g():
    ...     return "-".join(str(n) for n in range(10))
    ...
    """
    # Fail early: zero trials or zero calls would otherwise surface later as
    # an unrelated ValueError / ZeroDivisionError inside the wrapper.
    if repeat < 1 or number < 1:
        raise ValueError("`repeat` and `number` must both be >= 1")

    _repeat = functools.partial(itertools.repeat, None)

    def wrap(func):
        @functools.wraps(func)
        def _timeit(*args, **kwargs):
            # Temporarily turn off garbage collection during the timing.
            # Makes independent timings more comparable.
            # If it was originally enabled, switch it back on afterwards.
            gcold = gc.isenabled()
            gc.disable()
            try:
                # Outer loop - the number of repeats.
                trials = []
                for _ in _repeat(repeat):
                    # Inner loop - the number of calls within each repeat.
                    total = 0
                    for _ in _repeat(number):
                        start = _timer()
                        result = func(*args, **kwargs)
                        end = _timer()
                        total += end - start
                    trials.append(total)
                # We want the *average time* from the *best* trial.
                # For more on this methodology, see the docs for
                # Python's `timeit` module.
                #
                # "In a typical case, the lowest value gives a lower bound
                # for how fast your machine can run the given code snippet;
                # higher values in the result vector are typically not
                # caused by variability in Python's speed, but by other
                # processes interfering with your timing accuracy."
                best = min(trials) / number
                # Both lines go to `file` so the report is not split between
                # two streams.
                print(
                    "Best of {} trials with {} function"
                    " calls per trial:".format(repeat, number),
                    file=file,
                )
                print(
                    "Function `{}` ran in average"
                    " of {:0.3f} seconds.".format(func.__name__, best),
                    end="\n\n",
                    file=file,
                )
            finally:
                if gcold:
                    gc.enable()
            # Result is returned *only once*
            return result
        return _timeit

    # Syntax trick from Python @dataclass
    if _func is None:
        return wrap
    else:
        return wrap(_func)
5/5 - [1 rating]

import Bio
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
#on importe la sequence mystere avec les outils en ligne
def seq_codante_de_phase(phase):
    """Return the longest coding stretch of one translated reading frame.

    A coding stretch starts at an ``'M'`` and runs up to the next ``'*'``
    (stop). Stretches that reach the end of ``phase`` without a stop are
    ignored. When several stretches share the maximum length, the first one
    wins.

    Args:
        phase: Translated reading frame, one character per residue, with
            ``'*'`` marking stops.

    Returns:
        The residues of the longest stretch, without its leading ``'M'`` and
        without the stop. An empty string is returned when ``phase``
        contains no complete ``M...*`` stretch.
    """
    # Every piece except the last one is terminated by a stop.
    terminated = phase.split("*")[:-1]
    # Within a piece, the longest stretch starts at its first 'M'.
    candidates = [piece[piece.index("M"):] for piece in terminated if "M" in piece]
    if not candidates:
        return phase[:0]
    # max() returns the first of several equally long candidates.
    longest = max(candidates, key=len)
    # The returned protein excludes the initial 'M'.
    return longest[1:]
#main
# NOTE(review): Bio.Alphabet was removed in Biopython 1.78; on newer releases
# the `alphabet` argument (and the matching import) must be dropped.
# The context manager closes the FASTA file once the record has been read.
with open('my_sequence.fasta') as fasta_handle:
    mysterious_sequence = SeqIO.read(fasta_handle,
                                     'fasta',
                                     alphabet=generic_dna).seq
# The three forward reading frames.
phase_1 = mysterious_sequence[0::]
phase_2 = mysterious_sequence[1::]
phase_3 = mysterious_sequence[2::]
# Reverse the complemented sequence to build the three remaining frames.
complement_sequence = mysterious_sequence.complement()
reverse_sequence = complement_sequence[::-1]
# Frame starting at the last nucleotide, read backwards.
phase_4 = reverse_sequence[0::]
# Frame starting at the second-to-last nucleotide, read backwards.
phase_5 = reverse_sequence[1::]
# Frame starting at the third-to-last nucleotide, read backwards.
phase_6 = reverse_sequence[2::]
prot_1 = str(phase_1.translate())
prot_2 = str(phase_2.translate())
prot_3 = str(phase_3.translate())
prot_4 = str(phase_4.translate())
prot_5 = str(phase_5.translate())
prot_6 = str(phase_6.translate())
# Exactly one candidate per reading frame.
liste = [
    seq_codante_de_phase(prot)
    for prot in (prot_1, prot_2, prot_3, prot_4, prot_5, prot_6)
]
# Sort in place, longest first: the most plausible frame is the longest one,
# so it is the first element of the sorted list.
liste.sort(key=len, reverse=True)
print (liste[0])
0/5 - [0 rating]