# use a tab
for i in range(3):
	print(i)
# use 2 spaces
for i in range(3):
  print(i)
# use 4 spaces
for i in range(3):
    print(i)
Best practice: always use 4 spaces. Most editors let you choose whether indentation uses spaces (soft tabs) or tab characters.
In Vim, use :set list to reveal tabs and trailing whitespace so you can spot inconsistent indentation.
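For example, the following commonly used Vim options (a suggestion, adjust to taste) make the Tab key insert 4 spaces and make whitespace visible:
:set expandtab                              " insert spaces instead of tab characters
:set tabstop=4 softtabstop=4 shiftwidth=4   " indent by 4 spaces
:set list                                   " show tabs and trailing whitespace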
Add a shebang line and encoding declaration at the beginning of executable scripts
Create a file named welcome.py
#! /usr/bin/env python
# -*- coding: UTF-8 -*-
print('welcome to python!')
Then make the script executable:
chmod +x welcome.py
Now you can run the script without specifying the Python interpreter:
./welcome.py
All variables, functions, and classes are dynamic objects
class MyClass():
    def __init__(self, name):
        self.name = name

# assign an integer to a
a = 1
print(type(a))
# assign a string to a
a = 'abc'
print(type(a))
# assign a function to a
a = range
print(type(a))
print(a(10))
# assign a class to a
a = MyClass
print(type(a))
b = a('myclass')
print(b.name)
# assign an instance of a class to a
a = MyClass('myclass')
print(a.name)
# get the type of a
print(type(a))
All Python variables are references (pointers) to objects
a = [1, 2, 3]
print('a = ', a)
# this adds another reference to the same list
b = a
print('b = ', b)
# this will change the contents of both a and b
b[2] = 4
print('a = ', a)
print('b = ', b)
Use deepcopy if you really want to COPY a variable
from copy import deepcopy

a = {'A': [1], 'B': [2], 'C': [3]}
print(a)
# shallow copy
b = dict(a)
# modifying elements of b will change the contents of a
b['A'].append(2)
print('a = ', a)
print('b = ', b)
# a dict comprehension also makes only a shallow copy, so this does not work either
c = {k: v for k, v in a.items()}
c['A'].append(3)
print('a = ', a)
print('c = ', c)
# recursively copy every object in a
d = deepcopy(a)
# modifying elements of d will not change the contents of a
d['A'].append(2)
print('a = ', a)
print('d = ', d)
What if I accidentally overwrite a built-in function?
A = [1, 2, 3, 4]
# Oops! the builtin function sum is overwritten by a number
sum = sum(A)
# this will raise an error because sum is not a function now
print(sum(A))
# recover the builtin function into the current environment
from __builtin__ import sum
# this works because sum is a function again
print(sum(A))
Note: in Python 3, you should import from builtins rather than __builtin__
from builtins import sum
int is of arbitrary precision in Python!
In Python:
print(2**10000)
In R:
print(2^10000)
The easiest way to swap the values of two variables
In C/C++:
int a = 1, b = 2, t;
t = a;
a = b;
b = t;
In Python:
a = 1
b = 2
b, a = a, b
print(a, b)
List comprehension
Use for-loops:
a = []
for i in range(10):
    a.append(i + 10)
print(a)
Use list comprehension:
a = [i + 10 for i in range(10)]
print(a)
Dict comprehension
Use for-loops:
a = {}
for i in range(10):
    a[i] = chr(ord('A') + i)
print(a)
Use dict comprehension:
a = {i: chr(ord('A') + i) for i in range(10)}
print(a)
For the one-liners
Use ';' instead of '\n':
# print the first column of each line
python -c 'import sys; print("\n".join(line.split("\t")[0] for line in sys.stdin))'
For comparison, reading standard input line by line in a regular script:
import sys
# read line by line
for line in sys.stdin:
    print(line)
The order of dict keys is NOT what you might expect. Before Python 3.7, iterating over a dict returns keys in an arbitrary order; since Python 3.7, keys come back in insertion order, which is still not necessarily sorted.
a = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6}
# before Python 3.7 the order is arbitrary; since 3.7 it is insertion order
print([key for key in a])
# now in lexicographical order
print([key for key in sorted(a)])
Use enumerate() to get the index during iteration
A = ['a', 'b', 'c', 'd']
for i, a in enumerate(A):
    print(i, a)
Strings are immutable; use a bytearray if you need a mutable sequence of characters:
a = 'ABCDF'
# will raise an error because str objects are immutable
a[4] = 'E'
# convert str to a mutable bytearray (in Python 3, use bytearray(a, 'ascii'))
b = bytearray(a)
# bytearrays are mutable (in Python 3, assign an integer: b[4] = ord('E'))
b[4] = 'E'
# convert the bytearray back to str (in Python 3, use b.decode())
print(str(b))
Tuples are hashable while lists are not
# create a dict using tuples as keys
d = {
    ('chr1', 1000, 2000): 'featureA',
    ('chr1', 2000, 3000): 'featureB',
    ('chr1', 3000, 4000): 'featureC',
    ('chr1', 4000, 5000): 'featureD',
    ('chr1', 5000, 6000): 'featureE',
    ('chr1', 6000, 7000): 'featureF'
}
# query the dict using tuples
print(d[('chr1', 3000, 4000)])
print(d[('chr1', 6000, 7000)])
# will raise an error because lists are not hashable
d = {['chr1', 1000, 2000]: 'featureA'}
Use itertools
Write nested loops in a more concise way:
A = [1, 2, 3]
B = ['a', 'b', 'c']
C = ['i', 'j', 'k']
D = ['x', 'y', 'z']
# Use nested for-loops
for a in A:
    for b in B:
        for c in C:
            for d in D:
                print(a, b, c, d)
# Use itertools.product
import itertools
for a, b, c, d in itertools.product(A, B, C, D):
    print(a, b, c, d)
Get all combinations of a list:
A = ['A', 'B', 'C', 'D']
# Use itertools.combinations
import itertools
for a, b, c in itertools.combinations(A, 3):
    print(a, b, c)
Convert iterators to lists
import itertools
A = [1, 2, 3]
B = ['a', 'b', 'c']
a = itertools.product(A, B)
# a is an iterator rather than a list
print(a)
# a is a list now
a = list(a)
print(a)
Use the zip() function to transpose nested lists/tuples/iterables
records = [
    ('chr1', 1000, 2000),
    ('chr1', 2000, 3000),
    ('chr1', 3000, 4000),
    ('chr1', 4000, 5000),
    ('chr1', 5000, 6000),
    ('chr1', 6000, 7000)
]
# iterate by rows
for chrom, start, end in records:
    print(chrom, start, end)
# extract columns
chroms, starts, ends = zip(*records)
# build records from columns; records2 now holds the same rows as records
records2 = list(zip(chroms, starts, ends))
print(records2)
Global and local variables
# a is global
a = 1

def print_local():
    # a is local
    a = 2
    print(a)

def print_global():
    # a is global
    global a
    a = 2
    print(a)

# print the global variable
print(a)
# print the local variable from the function
print_local()
# a is unchanged
print(a)
# change and print the global variable from the function
print_global()
# a is changed
print(a)
Use defaultdict
Use dict:
d = {}
d['a'] = []
d['b'] = []
d['c'] = []
# extend the lists with new elements
d['a'] += [1, 2]
d['b'] += [3, 4, 5]
d['c'] += [6]
for key, val in d.items():
    print(key, val)
Use defaultdict:
from collections import defaultdict
# a new list is created automatically when a new key is accessed
d = defaultdict(list)
# extend the lists with new elements
d['a'] += [1, 2]
d['b'] += [3, 4, 5]
d['c'] += [6]
for key, val in d.items():
    print(key, val)
Use generators
Example: read a large FASTA file
def append_extra_line(f):
    """Yield an empty line after the last line in the file"""
    for line in f:
        yield line
    yield ''

def read_fasta(filename):
    with open(filename, 'r') as f:
        name = None
        seq = ''
        for line in append_extra_line(f):
            if line.startswith('>') or (len(line) == 0):
                if (len(seq) > 0) and (name is not None):
                    yield (name, seq)
                if line.startswith('>'):
                    name = line.strip()[1:].split()[0]
                    seq = ''
            else:
                if name is None:
                    raise ValueError('the first line does not start with ">"')
                seq += line.strip()

# print the name and length of each sequence
for name, seq in read_fasta('test.fa'):
    print(name, len(seq))
Turn off annoying KeyboardInterrupt and BrokenPipeError tracebacks
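A minimal sketch (assuming a Unix-like system): restore the default signal handlers at the top of your script so that Ctrl-C and closed pipes (e.g. when piping output into head) terminate the script quietly instead of printing a traceback:
import signal
# exit silently on SIGPIPE (broken pipe) and SIGINT (Ctrl-C)
signal.signal(signal.SIGPIPE, signal.SIG_DFL)
signal.signal(signal.SIGINT, signal.SIG_DFL)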
Matrix and array operations with numpy:
import numpy as np
# create a matrix of zeros with shape (5, 4)
X = np.zeros((5, 4), dtype=np.int32)
# create an array of length 5: [0, 1, 2, 3, 4]
y = np.arange(5)
# create an array of length 4: [0, 1, 2, 3]
z = np.arange(4)
# set the first row to [0, 1, 2, 3]
X[0] = np.arange(4)
# set the second row to [1, 1, 1, 1]
X[1] = 1
# add 1 to all elements
X += 1
# add y as a column vector (y[i] is added to every element of row i)
X += y.reshape((-1, 1))
# add z as a row vector (z[j] is added to every element of column j)
X += z.reshape((1, -1))
# get row sums
row_sums = X.sum(axis=1)
# get column sums
col_sums = X.sum(axis=0)
# matrix multiplication
A = X.dot(X.T)
# save the matrix to a text file
np.savetxt('data.txt', A)
For numerical analysis (probability distributions, signal processing, etc.), use scipy.
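For example, a small sketch using scipy.stats (the distribution and test here are just illustrative choices):
from scipy import stats
import numpy as np
# density and cumulative probability of the standard normal distribution at x = 1.96
print(stats.norm.pdf(1.96))
print(stats.norm.cdf(1.96))
# one-sample t-test of 100 random values against a mean of 0
t_stat, p_value = stats.ttest_1samp(np.random.randn(100), 0.0)
print(t_stat, p_value)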
Use numba to compile Python for-loops to native code and achieve performance close to C/C++. Example:
from numba import jit
from numpy import arange

# the jit decorator tells Numba to compile this function.
# The argument types will be inferred by Numba when the function is called.
@jit
def sum2d(arr):
    M, N = arr.shape
    result = 0.0
    for i in range(M):
        for j in range(N):
            result += arr[i, j]
    return result

a = arange(9).reshape(3, 3)
print(sum2d(a))
Process tabular data with pandas:
import pandas as pd
# read a BED file
genes = pd.read_table('gene.bed', header=None, sep='\t',
                      names=('chrom', 'start', 'end', 'gene_id', 'score', 'strand', 'biotype'))
# get all gene IDs
gene_ids = genes['gene_id']
# set gene_id as the index
genes.index = genes['gene_id']
# get the row with a given gene_id
gene = genes.loc['ENSG00000212325.1']
# get rows with biotype == 'protein_coding'
genes_selected = genes[genes['biotype'] == 'protein_coding']
# get protein-coding genes on chr1
genes_selected = genes.query('(biotype == "protein_coding") and (chrom == "chr1")')
# count genes for each biotype
biotype_counts = genes.groupby('biotype')['gene_id'].count()
# add a column for gene length
genes['length'] = genes['end'] - genes['start']
# calculate the average gene length for each chromosome and biotype
length_table = genes.pivot_table(values='length', index='biotype', columns='chrom')
# save the DataFrame to an Excel file
length_table.to_excel('length_table.xlsx')
Machine learning with scikit-learn:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score

# generate random data
X, y = make_classification(n_samples=1000, n_classes=2, n_features=10)
# split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# create a classifier object
model = LogisticRegression()
# train the classifier
model.fit(X_train, y_train)
# predict outcomes on the test dataset
y_pred = model.predict(X_test)
# evaluate the classification performance
print('roc_auc_score = %f' % roc_auc_score(y_test, y_pred))
print('accuracy_score = %f' % accuracy_score(y_test, y_pred))
Store and load arrays in HDF5 files with h5py. Write data to an HDF5 file:
import h5py
import numpy as np
# generate data
chroms = ['chr1', 'chr2', 'chr3']
chrom_sizes = {
    'chr1': 15000,
    'chr2': 12000,
    'chr3': 11000
}
coverage = {}
counts = {}
for chrom, size in chrom_sizes.items():
    coverage[chrom] = np.random.randint(10, 1000, size=size)
    counts[chrom] = np.random.randint(1000, size=size) % coverage[chrom]
# save data to an HDF5 file
with h5py.File('dataset.h5', 'w') as f:
    for chrom in chrom_sizes:
        g = f.create_group(chrom)
        g.create_dataset('coverage', data=coverage[chrom])
        g.create_dataset('counts', data=counts[chrom])
Inspect the file structure with h5ls:
h5ls -r dataset.h5
/ Group
/chr1 Group
/chr1/counts Dataset {15000}
/chr1/coverage Dataset {15000}
/chr2 Group
/chr2/counts Dataset {12000}
/chr2/coverage Dataset {12000}
/chr3 Group
/chr3/counts Dataset {11000}
/chr3/coverage Dataset {11000}
Read data from an HDF5 file:
import h5py
# read data from an HDF5 file
with h5py.File('dataset.h5', 'r') as f:
    coverage = {}
    counts = {}
    for chrom in f.keys():
        coverage[chrom] = f[chrom + '/coverage'][:]
        counts[chrom] = f[chrom + '/counts'][:]
Speed up compute-intensive loops with Cython. The example below is a .pyx module that computes a gradient, with the outer loop parallelized by OpenMP through cython.parallel (it has to be compiled with OpenMP enabled, e.g. via a setup.py):
import numpy as np
cimport numpy as np
cimport cython
from cython.parallel import prange
from cython.parallel cimport parallel
cimport openmp

@cython.boundscheck(False)  # turn off bounds-checking for the entire function
@cython.wraparound(False)   # turn off negative index wrapping for the entire function
def compute_mse_grad_linear_ard(np.ndarray[np.float64_t, ndim=1] w,
                                np.ndarray[np.float64_t, ndim=2] X1,
                                np.ndarray[np.float64_t, ndim=2] X2,
                                np.ndarray[np.float64_t, ndim=2] Kinv1,
                                np.ndarray[np.float64_t, ndim=2] K2,
                                np.ndarray[np.float64_t, ndim=2] a,
                                np.ndarray[np.float64_t, ndim=2] err,
                                np.ndarray[np.float64_t, ndim=2] mask=None):
    '''Compute the gradients of MSE on the test samples with respect to the relevance vector w.
    :param w: 1D array of shape [n_features]
    :return: gradients of MSE wrt. w, 1D array of shape [n_features]
    '''
    cdef np.int64_t N1, N2, p
    cdef np.int64_t k, i, j, m
    N1 = X1.shape[0]
    N2 = X2.shape[0]
    p = X2.shape[1]
    cdef np.ndarray[np.float64_t, ndim=2] K2Kinv1 = K2.dot(Kinv1)
    cdef np.ndarray[np.float64_t, ndim=1] mse_grad = np.zeros_like(w)
    # allocate one work buffer per thread so that threads do not overwrite each other
    cdef np.int64_t max_n_threads = openmp.omp_get_max_threads()
    cdef np.ndarray[np.float64_t, ndim=3] K1_grad = np.zeros((max_n_threads, N1, N1), dtype=np.float64)
    cdef np.ndarray[np.float64_t, ndim=3] K2_grad = np.zeros((max_n_threads, N2, N1), dtype=np.float64)
    cdef np.ndarray[np.float64_t, ndim=3] K_grad = np.zeros((max_n_threads, N2, N1), dtype=np.float64)
    cdef np.int64_t thread_id
    with nogil, parallel():
        for k in prange(p):
            thread_id = openmp.omp_get_thread_num()
            # compute K1_grad
            for i in range(N1):
                for j in range(N1):
                    K1_grad[thread_id, i, j] = 2.0*w[k]*X1[i, k]*X1[j, k]
            # compute K2_grad
            for i in range(N2):
                for j in range(N1):
                    K2_grad[thread_id, i, j] = 2.0*w[k]*X2[i, k]*X1[j, k]
            # compute K_grad
            for i in range(N2):
                for j in range(N1):
                    K_grad[thread_id, i, j] = K2_grad[thread_id, i, j]
                    for m in range(N1):
                        K_grad[thread_id, i, j] += K2Kinv1[i, m]*K1_grad[thread_id, m, j]
            # compute mse_grad
            for i in range(N2):
                for j in range(N1):
                    mse_grad[k] += err[i, 0]*K_grad[thread_id, i, j]*a[j, 0]
    return mse_grad, K_grad
Example: pretty-print a tab-separated table with the tvi script (source listed below). The first lines of a metadata file (metadata.tsv) are hard to read when printed directly:
File accession File format Output type Experiment accession Assay Biosample term id
ENCFF983DFB fastq reads ENCSR429XTR ChIP-seq EFO:0002067
ENCFF590TBW fastq reads ENCSR429XTR ChIP-seq EFO:0002067
ENCFF258RWG bam unfiltered alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF468LRV bam unfiltered alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF216EBS bam alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF232QFN bam unfiltered alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF682NGE bam alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF328UKA bam unfiltered alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF165COO bam alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF466OLG bam alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF595HIY bigBed narrowPeak peaks ENCSR429XTR ChIP-seq EFO:0002067
ENCFF494CKB bigWig fold change over control ENCSR429XTR ChIP-seq EFO:0002067
ENCFF308BXW bigWig fold change over control ENCSR429XTR ChIP-seq EFO:0002067
ENCFF368IHM bed narrowPeak peaks ENCSR429XTR ChIP-seq EFO:0002067
Now display the table more clearly:
head -n 15 metadata.tsv | tvi -d $'\t' -j center
Output:
File accession File format Output type Experiment accession Assay Biosample term id
ENCFF983DFB fastq reads ENCSR429XTR ChIP-seq EFO:0002067
ENCFF590TBW fastq reads ENCSR429XTR ChIP-seq EFO:0002067
ENCFF258RWG bam unfiltered alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF468LRV bam unfiltered alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF216EBS bam alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF232QFN bam unfiltered alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF682NGE bam alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF328UKA bam unfiltered alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF165COO bam alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF466OLG bam alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF595HIY bigBed narrowPeak peaks ENCSR429XTR ChIP-seq EFO:0002067
ENCFF494CKB bigWig fold change over control ENCSR429XTR ChIP-seq EFO:0002067
ENCFF308BXW bigWig fold change over control ENCSR429XTR ChIP-seq EFO:0002067
ENCFF368IHM bed narrowPeak peaks ENCSR429XTR ChIP-seq EFO:0002067
You can also get some help by typing tvi -h:
usage: tvi [-h] [-d DELIMITER] [-j {left,right,center}] [-s SEPARATOR]
[infile]
Print tables pretty
positional arguments:
infile input file, default is stdin
optional arguments:
-h, --help show this help message and exit
-d DELIMITER delimiter of fields of input. Default is white space.
-j {left,right,center}
justification, either left, right or center. Default
is left
-s SEPARATOR separator of fields in output
tvi.py
#! /usr/bin/env python
import sys
import argparse
import os
from io import StringIO

def main():
    parser = argparse.ArgumentParser(description='Print tables pretty')
    parser.add_argument('infile', type=str, nargs='?',
                        help='input file, default is stdin')
    parser.add_argument('-d', dest='delimiter', type=str,
                        required=False,
                        help='delimiter of fields of input. Default is white space.')
    parser.add_argument('-j', dest='justify', type=str,
                        required=False, default='left',
                        choices=['left', 'right', 'center'],
                        help='justification, either left, right or center. Default is left')
    parser.add_argument('-s', dest='separator', type=str,
                        required=False, default=' ',
                        help='separator of fields in output')
    args = parser.parse_args()

    table = []
    maxwidth = []
    # default is to read from stdin
    fin = sys.stdin
    if args.infile:
        try:
            fin = open(args.infile, 'rt')
        except IOError as e:
            sys.stderr.write('Error: %s: %s\n' % (e.strerror, args.infile))
            sys.exit(e.errno)
    # first pass: split each line into fields and record the maximum width of each column
    for line in fin:
        # split line by delimiter
        if args.delimiter:
            fields = line.strip().split(args.delimiter)
        else:
            fields = line.strip().split()
        for i in range(len(fields)):
            width = len(fields[i])
            if (i + 1) > len(maxwidth):
                maxwidth.append(width)
            else:
                if width > maxwidth[i]:
                    maxwidth[i] = width
        table.append(fields)
    fin.close()
    # second pass: print each field padded to the maximum width of its column
    try:
        for fields in table:
            line = StringIO()
            for i in range(len(fields)):
                # format the field with the requested justification
                nSpace = maxwidth[i] - len(fields[i])
                if args.justify == 'left':
                    line.write(fields[i])
                    line.write(' ' * nSpace)
                elif args.justify == 'right':
                    line.write(' ' * nSpace)
                    line.write(fields[i])
                elif args.justify == 'center':
                    line.write(' ' * (nSpace // 2))
                    line.write(fields[i])
                    line.write(' ' * (nSpace - nSpace // 2))
                line.write(args.separator)
            print(line.getvalue())
            line.close()
    except IOError:
        sys.exit(-1)

if __name__ == '__main__':
    main()
Generate a random FASTA file
seqgen.py
#! /usr/bin/env python
import sys
import argparse
import textwrap
import random

def write_fasta(fout, seq, name='seq', description=None):
    if description:
        fout.write('>' + name + ' ' + description + '\n')
    else:
        fout.write('>' + name + '\n')
    fout.write(textwrap.fill(seq) + '\n')

def main():
    parser = argparse.ArgumentParser(description='Generate sequences and output in various formats')
    parser.add_argument('-n', '--number', dest='number', type=int, required=False,
                        default=10, help='Number of sequences to generate')
    parser.add_argument('--min-length', dest='min_length', type=int, required=False,
                        default=30, help='Minimal length')
    parser.add_argument('--max-length', dest='max_length', type=int, required=False,
                        default=50, help='Maximal length')
    parser.add_argument('-l', '--length', type=int, required=False,
                        help='Fixed length. If specified, --min-length and --max-length will be ignored.')
    parser.add_argument('-a', '--alphabet', type=str, required=False,
                        default='ATGC', help='Letters to use in the sequences')
    parser.add_argument('-f', '--format', type=str, required=False,
                        choices=['fasta', 'text'], default='fasta', help='Output format')
    parser.add_argument('-o', '--outfile', type=argparse.FileType('w'), required=False,
                        default=sys.stdout, help='Output file name')
    parser.add_argument('-p', '--prefix', type=str, required=False,
                        default='RN_', help='Prefix of sequence names for fasta format')
    args = parser.parse_args()

    rand = random.Random()
    for i in range(args.number):
        if args.length:
            length = args.length
        else:
            length = rand.randint(args.min_length, args.max_length)
        # build a random sequence from the given alphabet
        seq = ''.join(rand.choice(args.alphabet) for j in range(length))
        if args.format == 'fasta':
            write_fasta(args.outfile, seq, args.prefix + '%08d' % i)
        else:
            args.outfile.write(seq + '\n')
    args.outfile.close()

if __name__ == '__main__':
    main()
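For example, assuming the script has been made executable, a run like the following would write 5 random sequences of length 40 to test.fa:
./seqgen.py -n 5 -l 40 -o test.fa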
Weekly tasks
All files you need for completing the tasks can be found at: weekly_tasks.zip
Task 1: run examples (Python tips, numpy, pandas) in this tutorial
Install Anaconda on your PC. Try to understand the example code and run it in Jupyter or IPython.
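For example, after installing Anaconda you can start an interactive session from a terminal with either of these commands (both ship with Anaconda):
jupyter notebook
ipython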
Task 2: write a Python program to convert a GTF file to BED12 format