Skip to content
Snippets Groups Projects
Unverified Commit f315b884 authored by Ian Fiddes's avatar Ian Fiddes Committed by GitHub
Browse files

Merge pull request #154 from eernst/fix-transcript-ids-munging

Fix for transcript IDs like rice from ensembl plants: Os01t0100500-01
parents eefbf102 07afad77
Loading
...@@ -387,18 +387,21 @@ def resolve_split_genes(tmp_size_filtered, transcript_gene_map, resolved_df, unf ...@@ -387,18 +387,21 @@ def resolve_split_genes(tmp_size_filtered, transcript_gene_map, resolved_df, unf
""" """
Use localNearBest algorithm to determine split genes and populate that field Use localNearBest algorithm to determine split genes and populate that field
""" """
with tools.fileOps.TemporaryFilePath() as local_tmp: with tools.fileOps.TemporaryFilePath() as local_tmp, tools.fileOps.TemporaryFilePath() as stripped_tmp:
cmd = [['sed', 's/\-[0-9]\+//', tmp_size_filtered], # strip unique identifiers for comparative filters with open(stripped_tmp, 'w') as outf:
['pslCDnaFilter', '-localNearBest=0.05', for rec in tools.psl.psl_iterator(tmp_size_filtered):
'-minCover=0.1', '-verbose=0', rec.q_name = tools.nameConversions.strip_alignment_numbers(rec.q_name)
'-minSpan=0.2', '/dev/stdin', '/dev/stdout']] tools.fileOps.print_row(outf, rec.psl_string())
cmd = ['pslCDnaFilter', '-localNearBest=0.05',
'-minCover=0.1', '-verbose=0',
'-minSpan=0.2', stripped_tmp, '/dev/stdout']
tools.procOps.run_proc(cmd, stdout=local_tmp) tools.procOps.run_proc(cmd, stdout=local_tmp)
filtered_alns = list(tools.psl.psl_iterator(local_tmp)) filtered_alns = list(tools.psl.psl_iterator(local_tmp))
# remove alignments that we didn't resolve # remove alignments that we didn't resolve
resolved_ids = set(resolved_df.TranscriptId) resolved_ids = set(resolved_df.TranscriptId)
filtered_alns = [x for x in filtered_alns if x.q_name in resolved_ids] filtered_alns = [x for x in filtered_alns if x.q_name in resolved_ids]
grouped = tools.psl.group_alignments_by_qname(filtered_alns) grouped = tools.psl.group_alignments_by_qname(filtered_alns, strip=False)
# construct the transcript interval for resolved transcripts # construct the transcript interval for resolved transcripts
tx_intervals = {tx_id: unfiltered_tx_dict[aln_id].interval for tx_intervals = {tx_id: unfiltered_tx_dict[aln_id].interval for
...@@ -418,3 +421,4 @@ def resolve_split_genes(tmp_size_filtered, transcript_gene_map, resolved_df, unf ...@@ -418,3 +421,4 @@ def resolve_split_genes(tmp_size_filtered, transcript_gene_map, resolved_df, unf
'Number of intra-contig split genes': len(split_gene_data['intra'])} 'Number of intra-contig split genes': len(split_gene_data['intra'])}
return merged, split_gene_metrics return merged, split_gene_metrics
...@@ -168,9 +168,13 @@ def get_alignment_dict(psl_file, make_unique=False): ...@@ -168,9 +168,13 @@ def get_alignment_dict(psl_file, make_unique=False):
return {psl.q_name: psl for psl in psl_iterator(psl_file, make_unique)} return {psl.q_name: psl for psl in psl_iterator(psl_file, make_unique)}
def group_alignments_by_qname(psl_iterator): def group_alignments_by_qname(psl_iterator, strip=True):
"""Convenience function for grouping PSLs by their name""" """Convenience function for grouping PSLs by their name"""
r = defaultdict(list) r = defaultdict(list)
for p in psl_iterator: if strip is True:
r[strip_alignment_numbers(p.q_name)].append(p) for p in psl_iterator:
r[strip_alignment_numbers(p.q_name)].append(p)
else:
for p in psl_iterator:
r[p.q_name].append(p)
return r return r
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment