Commit 330c2f6b authored by Liu, Kevin's avatar Liu, Kevin
Browse files

Removing unneeded.

parent 22fa653b
Intro:
This directory should contain everything you need in order to do two things:
1. Run simulation studies on SERES local tree annotation
2. Run emperical studies
Some of the software included depends on the ete3 python library for doing tree
operations. It is available at etetoolkit.org. It can be installed via anaconda
or via pip.
Setup:
The only thing you should need to do to get started is add the included bin
directory to your path.
export PATH="path_of_bin:$PATH"
File Conventions:
This software uses a few different kinds of files.
.fasta Self explanatory, MSA
.pos Posterior log
.walk Log from resampler, shows where the resampler turns around.
Used to translate positons back.
#!/usr/bin/env python
from ete3 import Tree
from sys import argv
#First, open the file the user specified
infile = open(argv[1])
#For each line in the file (genetree) strip off the partition length
tree_strings = []
for line in infile:
tree_strings.append(line[line.find(']') + 1:]);
#For each tree string, read a tree using ete3
trees = []
for tree_string in tree_strings:
trees.append(Tree(tree_string));
#Unroot all the trees
for index in range(len(trees)):
trees[index].unroot()
#Function which checks a list of trees for topological breakpoints
def has_topo_breakpoint(trees):
for i in range(0,len(trees) - 1):
if(trees[i].robinson_foulds(trees[i+1], unrooted_trees=True)[0] != 0):
return True
return False
#Run the function and exit with the appropriate status code
if(has_topo_breakpoint(trees)):
print("A topological breakpoint was found")
exit(0)
else:
print("A topological breakpoint was NOT found")
exit(1)
moddedrechmm/runTraining.py
\ No newline at end of file
#!/bin/bash
# This is a simple script which simplifys the interaction with the modified
# version of recHMM.
hmm $1 -k $2
from numpy import *
from extra_functions import reversed
def add_logs(x,y):
"A fast way to add logarithms"
if not x==0 and not y==0:
return x+log(1+exp(y-x))
elif x==0 and y==0:
print 'problem with log values'
return None
elif x==0:
return y
elif y==0:
return x
### Some example parameters to test accuray: ###
##transitions=array([[.5,.5],[.5,.5]])
##emissions=array([[.25,.25,.25,.25],[.08,.01,.01,.9]])
##seq=[0,0,2,3]
##begin_transition=[.6,.4]
def Forward(transitions, emissions, M):
sequence=range(M)
K=len(transitions[0,:])
global begin_transition #eventually change this??
begin_transition=[1/float(K) for k in range(K)]
f={}
for i in range(-1,M):
for l in range(K):
f[l,i]=0
if i==-1:
f[l,i]=log(begin_transition[l])
else:
for k in range(K):
f[l,i]=add_logs(f[l,i], f[k,i-1]+log(transitions[k,l]))
f[l,i]+=emissions[l,sequence[i]]
seqprob=0
for k in range(K):
seqprob=add_logs(seqprob, f[k,M-1]+log(begin_transition[k]))
# print(seqprob)
return f, seqprob
def Backward(transitions, emissions, M):
sequence=range(M)
K=len(transitions[0,:])
b={}
for i in reversed(range(M)):
for k in range(K):
if i==M-1:
b[k,M-1]=log(begin_transition[k])
else:
b[k,i]=0
for l in range(K):
b[k,i]=add_logs(b[k,i], log(transitions[k,l])+emissions[l,sequence[i+1]]+b[l,i+1])
b_seqprob=0
for l in range(K):
b_seqprob=add_logs(b_seqprob, b[l,0]+log(begin_transition[l])+emissions[l,sequence[0]])
return b, b_seqprob
def viterbi(data, init,trans,emit):
"computes viterbi sequence for a given observed sequence"
inferred_states=[]
K=len(init)
for obs in range(len(data[1,:])):
Probs={}
for state in range(K):
if inferred_states==[]:
Probs[state]=init[state]*exp(emit[state,obs])
else:
Probs[state]=trans[inferred_states[-1],state]*exp(emit[state,obs])
highest_probability=max(Probs.values())
for state in range(K):
if Probs[state]==highest_probability:
next_hidden_state=state
break
inferred_states.append(next_hidden_state)
return inferred_states
INSTALLATION
============
Since recHMM is implemented as a series of python modules,
installation is fairly easy, provided that you have python
along with NumPy and RPy on your system.
The only required installation step is summarized in the
Makefile in the main directory. The C++ extension package
uses SWIG to generate the in-between code which directs how
python and the C++ code interact, convert objects, etc.
Dependencies
==================
For recHMM to work correctly you will need to ensure that:
- Python is installed on your system. Most unix-like systems have this. If you have
older than python2.3, some things may not work, and you can upgrade at www.python.org
- The packages Numpy and Rpy are installed. Numpy is used extensively, whereas Rpy
is only used for plotting, and it's possible to not have Rpy installed if one doesn't
request plotting of posterior distributions. These can be downloaded for free:
http://numpy.scipy.org/
http://rpy.sourceforge.net/
-The C++ extension package is installed correctly.
recHMM: Detection of recombination with HMMs
===============================================================================
The primary reference point for recHMM is the following URL:
http://biowiki.org/RecHMM
===============================================================================
INSTALLATION
============
Type "make all".
See INSTALL file for more info.
===============================================================================
NOTES
=====
For help, type recHMM -h or see http://biowiki.org/RecHMM
The package was written by Oscar Westesson <breadbaron@berkeley.edu> while
in Ian Holmes's <ihh@berkeley.edu> group at UC Berkeley, Dept of Bioengineering.
===============================================================================
Attached below is software implementing RecHMM, an HMM-based method for computing local-phylogeny-switching breakpoints. Usage: ./runTraining.py <FASTA input alignment> -lb -prefix <empty existing working directory>/ -k <number of hidden states> -lt
Notes:
- Make sure to include the backslash after the -prefix option.
- Use 2 states for the -k option, corresponding to the two parental trees for our model network.
### Parse a fasta file into a dictionary of sequences ###
import sys
def get_fasta(fasta_input, log=True):
if isinstance(fasta_input, (file)):
fasta = fasta_input
elif isinstance(fasta_input, (str)):
fasta = open(fasta_input)
else:
print type(fasta_input), ': unknown filetype!'
alignmentStringList=[line.strip() for line in fasta.readlines()]
fasta.close()
alignDict={}
current_taxa=None
for line in alignmentStringList:
if len(line)==0:
continue
elif line[0]=='>':
if log:
print "New taxa:", line[1:]
alignDict[line[1:]]=''
current_taxa=line[1:]
else:
if log:
print "Adding to taxa:", current_taxa
alignDict[current_taxa]+=line
# else:
# print "Strange character in input file:", line
return alignDict
#a=get_fasta(raw_input('Enter filename: '), log=True)
#print "Taxa:", a.keys()
#print "Sequences: ", a.values()
#get_fasta(sys.stdin)
>2
TAATTAAATCGATTTTGGGGCTCAGCCAGTCCTCTTGAACTTGAACTGACGATACCAAGGTTCTTGGGCGTCCGATTGTGCGGACTCCCAGTTCGGCCTTATATCCCGAAGCTTACAATGCGTAAGCATTTCGGTGACGAATATCGCTTTTAACCTGCCCCCCCCATGATATGTCAGGTCCTACCCGAAAGTAGGGCGTAGCAATGGTAAGCCACCCGAGTGCGCAACTGCGACACTGAACCATACGGACCATTTCACTGAGCTCTGTTCCGGCGCAGCACGACGGATCCCGATAACCCTTTAACGAGCTGGGCCATCGGCTCAAAGCAGAGGGAACCGGTCTTCAGGGGAAGGGATGCAGGCATCTTCGTACGCAAAGCTTGTCTGGATCGGAATTAAAACAGGCAAGTCTTGATATCACGCATGTTTCATGAACCCCGTGTGAGGGATGCCAGCCTTTCCTTAGCCCTCTCCACTCGTGTATCTTTTACTCTTAAGTATAGACATGAAATAAGATCCATACCGGTATAATCCCTTTGCTAATTGGTCCGAAATCTAACAGATAATCGTGCCTTTGGCAGCGGTTCTATTACGACCGGTATTCTATGAATAATCACTAGGCGCATAGCCTATATACTGCAGGGGTGACTTATCCCAACAGGGAGGAGGCTTGTTAAGTCGAGCCCTGACGCATTCGCAATGGCACCTTAGTTGGATGCAAAGGGACACCCCAGGTCGATGCAGCGTGTCGACCGGTGGTACTTCAATGGCCTTTCCGCTTATGTCCCTTGCTCTGGATGATCATTGAGTTACTGCGACCTGTTGCAGCCGACGATCCCGCCCAGGTATTTAACGACATGAATTAATCGACATAAGCCGACTGAGCAGTATAGCTGGGTAGAATCAGTGCTCGACCAGGAGGCTGCCTCGTACTTCCGTTGAACCGATCGCACCTAGATACGTTAGTGCTTTGAGCGGATACTCTCATATAATGGGACCA
>4
TCAACTCAGAGCTGTGCAGTGTCAGCCATCTTTCTCTTAGTTAAGATTTTCTTCCCTGCGTTCAGGTGTTTCAGAGTTGACAGGAACGGATTGCGGGATTATATTAAGAACGTGACAAGCTGCGGTGAGCACAAACAAAGCCGCCGCCGATCACCTGCCCAAGCCATGTCTCCTGACTGCATTCGGCCGAGCTGGTCCATGCGTAAAGTATTCCCCTGAGCGCACCACTTATACCGTGAACCACACCAGTGGATGCTAAGCCCCTACTGTTGAAGACTTACGACGTAATATAGCGCTCCACTAAAAGAACACTCTATGCGCTTAATCCGTTTAGTTACCCGAGTTTGGCAAAGCGACTTCAGGTATCATGAGCGCAACTTATAATGCCTTAATCTCAAACATCCTATATTCTGGCCACCAGGCATGATTGCAAACGGTCTTAAGGCGTTTCCGAGGTTTGTGTTGTCTGTGGCCGTCCCGACGACTTTTATTTATAAGTAAGCCATTGTGAGCATACGCATAGCCGACATTTGACGTCGCTAACTATGCCCGGAACTAGGACACATTTAGGTAAGGTGCAGAGCTTGTTGTATCAGCATTATTAGTAGAATTATCACGTGGAGCATAATCGACCCTCGGTAGTCGTGAATTTTCTCCGCCAGTTGGGTCGCCTCACTGGCAACGATTTGGCCGCTCAGGCGAGACCTTTGGTGATATCCAACGGGATGGCGCGAGGTGTGCCAGCATGTAGATAGCCGGTTTTTCAAGTCCGGTTGGGCGCATAGGTCGTTATCTGCAGGTGATGTGTTTTAGCGCAACCTCATGCGGCGGTCTATCCCAGACAGGTCTACAGCAAGAAGTATCGCAGGTCAGAATGCGACACAGTTGTAGGGTTTGATAGAATAGTTCCCCTGTCACGAAGCTATAGCCTCCTTCCTGTCGACCCATTGCTCGGGGATTAGTTAGTGCAGTGAGCGTATACTTTTACAGAAGATGATTG
>1
TAGTTTAAGAGATCTTGTTGCAAAGCCAATCAGCACTAATTTAGTCTGGATTTAGCCGTGTTTACGTTGTTAGGGGTAGTCAGGTACGCCTCGCGGGTTTGGATCACCGGCTCAACAGGGCAGAGGCGTAACCAGCAAGGACTGAGCTAAGAACGTAGCGAGACGAAGCCATCTGACCTAGGACGCCCACTTCGGGCTACGCGGTCAACCTTACCCTGACACCGCGATCCAGCCCGTGCACCGCACGAACCGACTGACAGCACTTGCTCCCGGACCCTCACAACTTAACATACTGATCCACTGCAGAGCTGATCGATGCCCTGAATCCGTTGCGTTACTCTACTATCTCCACGCGTCTTTAGATCCGTTGTCGGATGAGATGGGCAGGGGAATCTCGATAGTCCGCTATAATGGGGTCTAGAAATGCTGTCAAATAGTATTTTGACGTTTCATGCCTTTTTGTTGGTCAGGCCAGACGGCACGTCGTGTGTTGTCTTGTACGGTCCTGGGAACAGACCCATACCGTTATAGAGCCTGTGCTAATTGGCCCGAAATCGAACAGATAACTGTGCATTTAGCAGGGGTGCGATTAGGACTGGTATTTAATGAATAAGCGCGAGGAGCATCGCCGTCATACTGCAGGCGTGCCTTAGCACAACATGTAGGAGCGTTGTCACGTCGACCCCTGACACGTGCGAACTCGCCCCTTAGTTGGATCCAGATGCATTCCCCAGGTCGCGCCAGCGTGTCGATCGGTGGTACTTCAACAGCATTTCCGATTATGTCCCTTACTCTGGATGAAAGTTGCATTACGTCGACCTGTTGCGAGCGCCAATGAGTCACTGCTCTTGAACGAAATGGATTCACGTACAGAAGCCGACACAGTATCTTAGCTGGGTAGAATCATTCCTAGACCTCGAGACTCCAACCTTGGTCCGGTGAACGGAGTGCACCTATATTCGCTACTGCTCTGTGCGGACAGTCTCATATAAAGGGACCG
>3
TAGATTACGAGAAGAGGATCGACTTCCAATCACCACAAATATAGTCCGTACTTATTGGTGCTTGTTCGTTACGGGGTAGTCGGGGACGCCTCGCGGGTTAGGGTCACGGGCAAAACAAGGTAGACGCGTAAAGCCTACGGACCTAGAGGACAACGTACAAAGGCGACGCCCTCTGATATAGGACGCCCAAATCGGGCTACGCGTTGAACATTACCATGAAAGCACTTTGCAGCGAATGAGCGACACGAACCGACTCACTGAACGATCTCCTGCACCCTCACAAGTTGACATAGTGGTCCACTCAGAAGCCGAAAGATCCCTTGAAGCTGTCTCTTTACTCTAGTTTGGCTTCGCGTCTATAGTTCTGTTGTCCGCCGACATGAGCAGGTTGCTCCCTCTAACCCCCTATTATGGGGTCTAGAAATGCGGTCCGAATATGAGTTTACGAATCATGGCTTTTGGTCGGCCACGTTAGATGCAACTTGTTGTATTTTCTTGTGTGGCCATGGCAGGAGACCCATACCGGTATAGCCCCTTTGCTAATTAGCTCTACGTATCACAGATGATCCTGCATTGGGCACGGGTTCTATGAGGACTGGTATTTAGAGAATAAGCACGAGGAGCTTCGCTGGCATACGGCAGGGGTGAATTATCCCGACAGGGAGGAGCGTTGTTATGGCGAGCCCTGACGCATTCGAAAACGCCTGTTAGTCGGAGCTAAATCCATACCCCATGTTGATCCAGCGTCTGGTACGGAGGTACTTCTTTGGCCTGTCCGCTTATGTGACATTCTAGGGATGGTCTTTGACTTCCTGTTATCAGTTGCGACAGATAAACCCGCCCAGGTCTTCAACGGAAAGGTTTACCCGACAGAAGCCCACGGGTTAGCTTAATTGGGTAGAATCATTCCTCGGCCGCGAGACTCAAACCATCTTCCGGTGTAGAGATTGAGCCAAGATGAGCTAGTTAGCTCAGCGGACACTCTCATGTAAAGGGACTT
#!/usr/bin/python
from numpy import *
import sys,time,random
import copy
minimum=30 #min length of recombinant region
maxGroupSize = 20 #largest set of breakpoints to be considered together
buffer = 5
# NB posteriors is assumed to be a global variable, a numpy array
# for arg in sys.argv:
# if '-c' in arg:
# minimum = int(arg.replace('-c',''))
def find_val(array,value):
K,M,min = shape(array)
for i in range(K):
for e in range(min): #array is assumed to have 3rd dim of 'minimum'
if array[i,M-1,e] == value:
return i,M-1,e
def traceback(startPoint,pointers):
points = [startPoint]
while 1:
current_point = points[-1]
if current_point:
points.append(pointers.get(current_point))
else:
points.remove(None)
break
return [point[0] for point in points]
def get_path(bin_vector, breakpoints):
'from a binary vector and a set of breakpoints (these vectors must be equal length), extract the unique state path'
newBreaks = []
for i,val in enumerate(bin_vector):
if val:
newBreaks.append(breakpoints[i])
state_path = []
for col in range(len(posteriors[0,:])):
if not col: # first column, initialize with max in 0th column
state_path.append(biggest(posteriors,col))
elif col in newBreaks or col<breakpoints[0] or col>breakpoints[-1]:
state_path.append(biggest(posteriors,col))
else: #col is not a breakpoint, so we remain in the same state
state_path.append(state_path[-1])
return state_path
def get_path2(bin_vector, breakpoints):
start, end = breakpoints[0],breakpoints[-1]
'from a binary vector and a set of breakpoints (these vectors must be equal length), extract the unique state path'
newBreaks = []
for i,val in enumerate(bin_vector):
if val:
newBreaks.append(breakpoints[i])
#we initialize the statepath as having a buffer on the left side...
state_path = [ biggest(posteriors,start-5) for i in range(buffer) ]
for col in range(start,end+5):
if col in newBreaks:
state_path.append(biggest(posteriors,col))
else: #col is not a breakpoint, so we remain in the same state
state_path.append(state_path[-1])
#print 'path for', bin_vector, state_path
return state_path
def get_score(state_path,breakpoints):
'returns the sum of posterior probabilities of a state path'
#print 'getting score for ', breakpoints
start = breakpoints[0]-5
end = breakpoints[-1]+5
try:
assert len(state_path) == len(range(start,end))
except:
print 'indexing error! statepath length:', len(state_path), 'length of start, end:', end-start
retVal = 0
for index,val in enumerate(range(start,end)):
try:
retVal += posteriors[state_path[index], val]
except:
print breakpoints, 'was not verifiable'
sys.exit()
return retVal
#return sum([posteriors[state,column] for column,state in enumerate(state_path)])
def get_decision_array(r):
mat=[[1],[0]]
while len(mat)!=2**r:
newlist=[]
for i,row in enumerate(mat):
newlist.append(row+[1])
row.append(0)
mat+=newlist
return array(mat)
def list_one(N,i):
lis = []
for j in range(N):
lis.append(i>>j&1)
return lis
def primitive_decoding(path):
##### primitive breakpoint cleaning-up. Really we should do this with the posterior dist. ###
breakpoints = []
for i,entry in enumerate(path):
if i==0:
continue
if entry!=path[i-1]:
breakpoints.append(i)
change = True
thru = 0
while change:
breakpointCopy = copy.deepcopy(breakpoints)
thru += 1
change = False
for i,point in enumerate(breakpoints):
if i==0:
if point<minimum:
breakpoints[i]=-1
change = True
elif abs(point-breakpointCopy[i-1]) < minimum:
breakpoints[i]=-1
change = True
while breakpoints.count(-1)!=0:
breakpoints.remove(-1)
return breakpoints
def biggest(array,col):
max_index=-1
max_value=0
rows, columns=shape(array)
#print rows, columns
for i in range(rows):
if array[i,col]>max_value:
max_index=i
max_value=array[i,col]
return max_index
def get_initial_breaks(posteriorArray,opt='None',breaks=[]):
if type(posteriorArray)==list:
path=posteriorArray
elif opt=='shortpath':
if breaks!=[]:
pass
else:
rows,cols=shape(posteriorArray)
path=[biggest(posteriorArray,0)]
# just build the state path naively, by posterior matrix:
for column in range(1,cols):
path.append(biggest(posteriorArray,column)); #print 'path remains same'
pathBreaks = []
for i,entry in enumerate(path):
if i==0:
continue
if entry!=path[i-1]:
pathBreaks.append(i)
return pathBreaks
def decoding2(posteriorArray, log=False):
final_breaks = []
initial_breakpoints = get_initial_breaks(posteriorArray)
if log:
print 'Our first guess at bps (max at each position):', initial_breakpoints
groups = find_bad_groups(initial_breakpoints)
if log:
print 'Groups of close breakpoints:', groups
for group in groups:
if len(group)>maxGroupSize:
sys.stderr.write('One group is very large, may slow down computations...size: '+str(len(group))+'\n')
continue
if log:
print 'Current group under consideration:', group
if len(group) == 1:
final_breaks.append(group[0])
if log:
print '\t Group of length one, not an issue. Continue'
continue
elif group[0]<minimum or len(posteriors[1,:])-group[-1]<minimum:
for index,value in enumerate(group):
if value<minimum or len(posteriors[1,:])-value<minimum:
group[index] = -1
while group.count(-1)>0:
group.remove(-1)
if group==[]:
continue
scoreDict = {} # we evaluate each group separately
#this dictionary stores the scores of each combination of breakpoints
L = len(group)
N = 2**len(group)
for i in xrange(N): #search over all possible (!) combinations of breakpoints, now memory-efficient!
bin_vector = list_one(L,i) #this gives the ith binary vector of length L
try:
assert len(bin_vector) == len(group)
except:
print 'problem: bin_vector not equal length to bp list!'
sys.exit(1)
#filtering out those which have breaks too close together
if good_path(bin_vector,group):
#this is the key step, so we break it apart a bit:
#in theory, a binary vector should map to exactly one path, which is the path from the first bp in 'group' to the end, where breakpoints are only allowed at those which are specified by bin_vector
path = get_path2(bin_vector,group)
score = get_score(path,group)
if log:
print bin_vector, 'has score', score
scoreDict[tuple(bin_vector)] = score
else:
continue
#now recover the maximal statepath:
for key in scoreDict.keys():
if scoreDict[key]==max(scoreDict.values()):
maxBreaks = get_initial_breaks(get_path2(key,group),opt='shortpath',breaks=group)
for i,value in enumerate(maxBreaks):
maxBreaks[i]= value-buffer+group[0]
if log:
print key, 'has full path:', maxBreaks
if not maxBreaks == []:
for index,val in enumerate(maxBreaks):
if val<group[0] or val>group[-1]:
maxBreaks[index]=-1
while maxBreaks.count(-1)>0:
maxBreaks.remove(-1)
final_breaks += maxBreaks
if log:
print key, 'is maximal, with path:', maxBreaks,'\n'
break
#test
for val in final_breaks:
if final_breaks.count(val)>1:
sys.stderr.write('Warning: breakpoints have duplicate values!')
#one final check:
if len(final_breaks)>0:
if final_breaks[0]<minimum:
final_breaks.pop(0)
sys.stderr.write('Final decoded breakpoints: '+str(final_breaks)+'\n')
return final_breaks
def good_path(bin_vector, breaks):
" boolean if bin_vector defines a good state path of values from breaks"
newBreaks = []
for i,val in enumerate(bin_vector):
if val:
newBreaks.append(breaks[i])
for i, point in enumerate(newBreaks):
if point < minimum:
return False
if len(posteriors[1,:]) - point <minimum:
return False
if not i:
continue
if abs(point - newBreaks[i-1])<minimum:
return False
return True
def find_bad_groups(breaks):
'finds groups of breakpoints which have members within minimum'
retList = [[]]
for i, point in enumerate(breaks[:-1]):
if breaks[i+1]-point <= minimum:
retList[-1].append(point)
else:
retList[-1].append(point)
retList.append([])
if retList[-1] == []: #the 2nd-last breakpoint was not close to the last one
if log:
print 'examining last bp'
if len(breaks)>0:
if len(posteriors[0,:]) - breaks[-1] < minimum :
retList[-1] += [breaks[-1],len(posteriors[1,:])-1] # a bit cryptic, but basically just deciding whether or not to keep the last breakpoint
else:
retList[-1].append(breaks[-1]) # it is a 'good' lone breakpoint, keep it
else: # the last group was never terminated!
#sys.stderr.write('Last breakpoint could be significant: '+str(breaks[-1])+' ' + str(retList)+'\n')
retList[-1].append(breaks[-1])
for subList in retList:
if len(subList)<1:
retList.remove(subList)
# sys.stderr.write('groups: '+str(retList)+'\n')
return retList
# try:
# for line in open(sys.argv[1]).readlines():
# if '=matrix(c(' in line:
# nrows = int(line[line.index('nrow=')+5:-2])
# numbers2parse = line[line.index('c(')+2: line.index(')')]
# bigList = [ float(num) for num in numbers2parse.split(',')]
# newArray = [ [] for i in range(nrows) ]
# lenRow = len(bigList)/nrows
# for i in range(nrows):
# newArray[i] = bigList[i*lenRow:(i+1)*lenRow]
# posteriors = array(newArray)
# except:
# sys.stderr.write('Problem opeing inputfiles !\n')