Skip to content

Commit 9537734

Browse files
committed
fixed issues from unordered gtf accommodations
1 parent 21a6042 commit 9537734

File tree

4 files changed

+130
-25
lines changed

4 files changed

+130
-25
lines changed

swan_vis/swangraph.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -541,16 +541,16 @@ def create_dfs_gtf(self, gtf_file):
541541
locs[key] = vertex_id
542542
vertex_id += 1
543543

544-
# create inverse loc dict to sort paths by
545-
locs_inv = {v: k for k, v in locs.items()}
546-
547544
# add locs-indexed path to transcripts, and populate edges
548545
edges = {}
549546
for _,t in transcripts.items():
550547
t['path'] = []
551548
strand = t['strand']
552549
t_exons = t['exons']
553550

551+
# reorder exons that are in weird orders from the GTF
552+
t_exons = reorder_exons(t_exons)
553+
554554
for i, exon_id in enumerate(t_exons):
555555

556556
# pull some information from exon dict
@@ -583,10 +583,6 @@ def create_dfs_gtf(self, gtf_file):
583583
if key not in edges:
584584
edges[key] = {'edge_id': edge_id, 'edge_type': 'intron'}
585585

586-
# sort the path based on chromosomal coordinates and strand
587-
# in case there's some weird ordering in the gtf
588-
t['path'] = reorder_locs(t['path'], strand, locs_inv)
589-
590586
# turn transcripts, edges, and locs into dataframes
591587
locs = [{'chrom': key[0],
592588
'coord': key[1],
@@ -1018,7 +1014,13 @@ def find_ir_genes(self):
10181014
sub_nodes = [i for i in range(eid[0]+1,eid[1])]
10191015
sub_G = self.G.subgraph(sub_nodes)
10201016
sub_edges = list(sub_G.edges())
1021-
sub_edges = self.edge_df.loc[sub_edges]
1017+
try:
1018+
sub_edges = self.edge_df.loc[sub_edges]
1019+
except:
1020+
for blop in sub_edges:
1021+
if blop not in self.edge_df.edge_id.tolist():
1022+
print(blop)
1023+
continue
10221024
sub_edges = sub_edges.loc[sub_edges.edge_type == 'intron']
10231025

10241026
if len(sub_edges.index) > 0:

swan_vis/utils.py

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -204,18 +204,27 @@ def find_edge_start_stop(v1, v2, strand):
204204
stop = max([v1, v2])
205205
return start, stop
206206

207-
# reorder the locations in a transcript's path based on
208-
# chromosomal coordinate
209-
# TODO
210-
def reorder_locs(path, strand, locs):
211-
coords = [locs[i] for i in path]
212-
path_coords = sorted(zip(path, coords), key=lambda x: x[1])
213-
path = [i[0] for i in path_coords]
214-
coords = [i[1][1] for i in path_coords]
207+
# reorder exon ids from create_dfs_gtf
208+
def reorder_exons(exon_ids):
    """
    Order one transcript's exon ids by coordinate, honoring strand.

    Each exon id is parsed by splitting on '_': the second-to-last field
    is read as the strand and the fourth-to-last field as an integer
    coordinate. Counting fields from the end keeps the parsing correct
    when the leading part of the id itself contains underscores.

    The strand is taken from the first exon id only; all ids are assumed
    to belong to the same transcript (and therefore the same strand).

    Parameters:
        exon_ids (list of str): Exon ids in arbitrary order

    Returns:
        exons (list of str): Exon ids sorted by ascending coordinate,
            reversed when the strand is '-'. An empty input yields an
            empty list.
    """
    # nothing to order; also avoids indexing into an empty list below
    if not exon_ids:
        return []

    strand = exon_ids[0].split('_')[-2]

    # compare coordinates numerically, not as strings; sorted() is stable,
    # so ids sharing a coordinate keep their relative input order
    exons = sorted(exon_ids, key=lambda exon_id: int(exon_id.split('_')[-4]))

    # minus-strand transcripts run from high to low coordinates
    if strand == '-':
        exons.reverse()
    return exons
216+
217+
# # reorder the locations in a transcript's path based on
218+
# # chromosomal coordinate
219+
# def reorder_locs(path, strand, locs):
220+
# coords = [locs[i] for i in path]
221+
# path_coords = sorted(zip(path, coords), key=lambda x: x[1])
222+
# path = [i[0] for i in path_coords]
223+
# coords = [i[1][1] for i in path_coords]
224+
# if strand == '-':
225+
# path.reverse()
226+
# return path
227+
219228
# get novelty types associated with each transcript
220229
def get_transcript_novelties(fields):
221230
if fields['transcript_status'] == 'KNOWN':

0 commit comments

Comments
 (0)