Skip to content

Commit 21a6042

Browse files
committed
added support for incorrectly ordered GTF entries and test cases for it
1 parent 68b199d commit 21a6042

File tree

4 files changed

+55
-2
lines changed

4 files changed

+55
-2
lines changed

swan_vis/swangraph.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ def add_dataset(self, col, fname,
161161
if ftype == 'gtf':
162162
self.create_dfs_gtf(fname)
163163
elif ftype == 'db':
164-
self.create_dfs_db(fname, annot, whitelist, 'hepg2_1')
164+
self.create_dfs_db(fname, annot, whitelist, dataset_name)
165165

166166
# add column to each df to indicate where data came from
167167
self.loc_df[col] = True
@@ -175,7 +175,7 @@ def add_dataset(self, col, fname,
175175
if ftype == 'gtf':
176176
temp.create_dfs_gtf(fname)
177177
elif ftype == 'db':
178-
temp.create_dfs_db(fname, annot, whitelist, 'hepg2_1')
178+
temp.create_dfs_db(fname, annot, whitelist, dataset_name)
179179
self.merge_dfs(temp, col)
180180

181181
# remove isms if we have access to that information
@@ -541,6 +541,9 @@ def create_dfs_gtf(self, gtf_file):
541541
locs[key] = vertex_id
542542
vertex_id += 1
543543

544+
# create inverse loc dict to sort paths by
545+
locs_inv = {v: k for k, v in locs.items()}
546+
544547
# add locs-indexed path to transcripts, and populate edges
545548
edges = {}
546549
for _,t in transcripts.items():
@@ -580,6 +583,10 @@ def create_dfs_gtf(self, gtf_file):
580583
if key not in edges:
581584
edges[key] = {'edge_id': edge_id, 'edge_type': 'intron'}
582585

586+
# sort the path based on chromosomal coordinates and strand
587+
# in case there's some weird ordering in the gtf
588+
t['path'] = reorder_locs(t['path'], strand, locs_inv)
589+
583590
# turn transcripts, edges, and locs into dataframes
584591
locs = [{'chrom': key[0],
585592
'coord': key[1],

swan_vis/utils.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,18 @@ def find_edge_start_stop(v1, v2, strand):
204204
stop = max([v1, v2])
205205
return start, stop
206206

207+
# reorder the locations in a transcript's path based on
208+
# chromosomal coordinate
209+
# TODO
210+
def reorder_locs(path, strand, locs):
    """Return the vertices of a transcript's path in sorted order.

    Vertices are sorted in ascending order of their entry in ``locs``.
    For a minus-strand transcript the sorted order is then reversed.

    Parameters:
        path (list): Vertex ids that make up the transcript's path
        strand (str): Strand of the transcript; '-' reverses the order
        locs (dict): Maps each vertex id in ``path`` to a sortable
            location key
            NOTE(review): the caller passes an inverted locs dict,
            presumably keyed on (chrom, coord, ...) tuples -- confirm
            against create_dfs_gtf

    Returns:
        list: A new list holding the vertex ids of ``path`` in sorted
            order; the input list is left unmodified

    Raises:
        KeyError: If a vertex id in ``path`` is missing from ``locs``
    """
    # sort the vertex ids directly by their location key; sorted() is
    # stable, so vertices with equal keys keep their input order
    ordered = sorted(path, key=lambda vertex: locs[vertex])

    # minus-strand transcripts run from high to low coordinates
    if strand == '-':
        ordered.reverse()
    return ordered
218+
207219
# get novelty types associated with each transcript
208220
def get_transcript_novelties(fields):
209221
if fields['transcript_status'] == 'KNOWN':
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
chr6 HAVANA transcript 143815949 143832857 . - . gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_num_mappings 1; remap_status "full_contig"; remap_target_status "overlap";
2+
chr6 HAVANA exon 143815949 143816984 . - . gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; exon_number 1; exon_id "ENSE00001828368.3_1"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_original_location "chr6:-:143494812-143495847"; remap_status "full_contig";
3+
chr6 HAVANA exon 143818526 143818634 . - . gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; exon_number 2; exon_id "ENSE00002227591.1_1"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_original_location "chr6:-:143497389-143497497"; remap_status "full_contig";
4+
chr6 HAVANA exon 143823069 143823259 . - . gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; exon_number 3; exon_id "ENSE00003473218.1_1"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_original_location "chr6:-:143501932-143502122"; remap_status "full_contig";
5+
chr6 HAVANA exon 143823492 143823702 . - . gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; exon_number 4; exon_id "ENSE00002258449.1_1"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_original_location "chr6:-:143502355-143502565"; remap_status "full_contig";
6+
chr6 HAVANA exon 143825050 143825389 . - . gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; exon_number 5; exon_id "ENSE00002248349.1_1"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_original_location "chr6:-:143503913-143504252"; remap_status "full_contig";
7+
chr6 HAVANA exon 143828374 143828561 . - . gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; exon_number 6; exon_id "ENSE00003708374.1_1"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_original_location "chr6:-:143507237-143507424"; remap_status "full_contig";
8+
chr6 HAVANA exon 143832548 143832857 . - . gene_id "ENSG00000001036.14_4"; transcript_id "ENST00000002165.11_3"; gene_type "protein_coding"; gene_name "FUCA2"; transcript_type "protein_coding"; transcript_name "FUCA2-201"; exon_number 7; exon_id "ENSE00003705756.2_1"; level 2; protein_id "ENSP00000002165.5"; transcript_support_level 1; hgnc_id "HGNC:4008"; tag "basic"; tag "MANE_Select"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS5200.1"; havana_gene "OTTHUMG00000015728.3_4"; havana_transcript "OTTHUMT00000042521.3_3"; remap_original_location "chr6:-:143511411-143511720"; remap_status "full_contig";
9+
chr1 HAVANA transcript 326096 328112 . + . gene_id "ENSG00000250575.1"; transcript_id "ENST00000514436.1"; gene_type "pseudogene"; gene_status "KNOWN"; gene_name "RP4-669L17.8"; transcript_type "unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "RP4-669L17.8-001"; level 1; ont "PGO:0000005"; tag "pseudo_consens"; havana_gene "OTTHUMG00000002861.2"; havana_transcript "OTTHUMT00000008000.2"; remap_substituted_missing_target "V19";
10+
chr1 HAVANA exon 326096 326569 . + . gene_id "ENSG00000250575.1"; transcript_id "ENST00000514436.1"; gene_type "pseudogene"; gene_status "KNOWN"; gene_name "RP4-669L17.8"; transcript_type "unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "RP4-669L17.8-001"; exon_number 1; exon_id "ENSE00002058739.1"; level 1; ont "PGO:0000005"; tag "pseudo_consens"; havana_gene "OTTHUMG00000002861.2"; havana_transcript "OTTHUMT00000008000.2"; remap_substituted_missing_target "V19";
11+
chr1 HAVANA exon 327348 328112 . + . gene_id "ENSG00000250575.1"; transcript_id "ENST00000514436.1"; gene_type "pseudogene"; gene_status "KNOWN"; gene_name "RP4-669L17.8"; transcript_type "unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "RP4-669L17.8-001"; exon_number 2; exon_id "ENSE00002064640.1"; level 1; ont "PGO:0000005"; tag "pseudo_consens"; havana_gene "OTTHUMG00000002861.2"; havana_transcript "OTTHUMT00000008000.2"; remap_substituted_missing_target "V19";

testing/test_adding_datasets.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,29 @@ def test_add_annotation(self):
3939
test = sg.t_df.apply(lambda x: (x.tid, x.novelty), axis=1)
4040
check_pairs(control, test)
4141

42+
def test_weird_gtf(self):
    """Check that paths built from an oddly ordered GTF come out in the
    expected coordinate order for each transcript."""
    graph = swan.SwanGraph()
    graph.add_dataset('test', 'input_files/weird_gtf_entries.gtf')
    print(graph.t_df)

    # transcript id -> coordinates expected along its path, in path order
    expected = [
        ('ENST00000002165.11_3',
         [143832857, 143832548, 143828561,
          143828374, 143825389, 143825050,
          143823702, 143823492, 143823259,
          143823069, 143818634, 143818526,
          143816984, 143815949]),
        ('ENST00000514436.1',
         [326096, 326569, 327348, 328112]),
    ]

    # check each transcript
    for transcript_id, ctrl_coords in expected:
        vertex_path = graph.t_df.loc[transcript_id, 'path']
        print(vertex_path)
        test_coords = graph.loc_df.loc[vertex_path, 'coord'].tolist()
        check_pairs(ctrl_coords, test_coords)
64+
4265
def check_pairs(control, test):
4366
print('control')
4467
print(control)

0 commit comments

Comments
 (0)