@@ -54,7 +54,7 @@ def to_array(dictionary):
5454
5555def parse_expression_file (args , vcf_reader , vcf_writer ):
5656 if args .format == 'stringtie' and args .mode == 'transcript' :
57- df_all = read_gtf (args .expression_file )
57+ df_all = read_gtf (args .expression_file , usecols = [ 'reference_id' , 'transcript_id' , 'TPM' , 'feature' ] )
5858 df = df_all [df_all ["feature" ] == "transcript" ]
5959 id_column = resolve_stringtie_id_column (args , df .columns .values )
6060 else :
@@ -106,24 +106,23 @@ def create_vcf_writer(args, vcf_reader):
106106 output_file = args .output_vcf
107107 return vcfpy .Writer .from_path (output_file , new_header )
108108
109- def add_expressions (entry , is_multi_sample , sample_name , df , items , tag , id_column , expression_column , ignore_ensembl_id_version , missing_expressions_count , entry_count ):
110- expressions = {}
111- for item in items :
112- entry_count += 1
113- if ignore_ensembl_id_version :
114- subset = df .loc [df ['transcript_without_version' ] == re .sub (r'\.[0-9]+$' , '' , item )]
115- else :
116- subset = df .loc [df [id_column ] == item ]
117- if len (subset ) > 0 :
118- expressions [item ] = subset [expression_column ].sum ()
119- else :
120- missing_expressions_count += 1
109+ def add_expressions (entry , is_multi_sample , sample_name , df , items , tag , id_column , expression_column , ignore_ensembl_id_version , missing_expressions_count ):
110+ subset = None
111+ if ignore_ensembl_id_version :
112+ items_without_version = [re .sub (r'\.[0-9]+$' , '' , item ) for item in items ]
113+ subset = df [df ['transcript_without_version' ].isin (items_without_version )]
114+ for item in items :
115+ subset .loc [subset ['transcript_without_version' ] == re .sub (r'\.[0-9]+$' , "" , item ), id_column ] = item
116+ else :
117+ subset = df [df [id_column ].isin (items )]
118+ expressions = subset [[id_column , expression_column ]].groupby (id_column ).sum ().to_dict ()[expression_column ]
119+ missing_expressions_count += (len (items ) - len (expressions ))
121120 if is_multi_sample :
122121 entry .FORMAT += [tag ]
123122 entry .call_for_sample [sample_name ].data [tag ] = to_array (expressions )
124123 else :
125124 entry .add_format (tag , to_array (expressions ))
126- return (entry , missing_expressions_count , entry_count )
125+ return (entry , missing_expressions_count )
127126
128127def define_parser ():
129128 parser = argparse .ArgumentParser (
@@ -216,11 +215,13 @@ def main(args_input = sys.argv[1:]):
216215 if args .mode == 'gene' :
217216 genes = list (genes )
218217 if len (genes ) > 0 :
219- (entry , missing_expressions_count , entry_count ) = add_expressions (entry , is_multi_sample , args .sample_name , df , genes , 'GX' , id_column , expression_column , args .ignore_ensembl_id_version , missing_expressions_count , entry_count )
218+ (entry , missing_expressions_count ) = add_expressions (entry , is_multi_sample , args .sample_name , df , genes , 'GX' , id_column , expression_column , args .ignore_ensembl_id_version , missing_expressions_count )
219+ entry_count += len (genes )
220220 elif args .mode == 'transcript' :
221221 transcript_ids = list (transcript_ids )
222222 if len (transcript_ids ) > 0 :
223- (entry , missing_expressions_count , entry_count ) = add_expressions (entry , is_multi_sample , args .sample_name , df , transcript_ids , 'TX' , id_column , expression_column , args .ignore_ensembl_id_version , missing_expressions_count , entry_count )
223+ (entry , missing_expressions_count ) = add_expressions (entry , is_multi_sample , args .sample_name , df , transcript_ids , 'TX' , id_column , expression_column , args .ignore_ensembl_id_version , missing_expressions_count )
224+ entry_count += len (transcript_ids )
224225 vcf_writer .write_record (entry )
225226
226227 vcf_reader .close ()
0 commit comments