Skip to content

Commit 51371de

Browse files
Merge pull request #79 from griffithlab/exp_speed
Refactor vcf-expression-annotator to be faster
2 parents 9d19ac3 + 2d96b88 commit 51371de

1 file changed

Lines changed: 17 additions & 16 deletions

File tree

vatools/vcf_expression_annotator.py

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ def to_array(dictionary):
5454

5555
def parse_expression_file(args, vcf_reader, vcf_writer):
5656
if args.format == 'stringtie' and args.mode == 'transcript':
57-
df_all = read_gtf(args.expression_file)
57+
df_all = read_gtf(args.expression_file, usecols=['reference_id', 'transcript_id', 'TPM', 'feature'])
5858
df = df_all[df_all["feature"] == "transcript"]
5959
id_column = resolve_stringtie_id_column(args, df.columns.values)
6060
else:
@@ -106,24 +106,23 @@ def create_vcf_writer(args, vcf_reader):
106106
output_file = args.output_vcf
107107
return vcfpy.Writer.from_path(output_file, new_header)
108108

109-
def add_expressions(entry, is_multi_sample, sample_name, df, items, tag, id_column, expression_column, ignore_ensembl_id_version, missing_expressions_count, entry_count):
110-
expressions = {}
111-
for item in items:
112-
entry_count += 1
113-
if ignore_ensembl_id_version:
114-
subset = df.loc[df['transcript_without_version'] == re.sub(r'\.[0-9]+$', '', item)]
115-
else:
116-
subset = df.loc[df[id_column] == item]
117-
if len(subset) > 0:
118-
expressions[item] = subset[expression_column].sum()
119-
else:
120-
missing_expressions_count += 1
109+
def add_expressions(entry, is_multi_sample, sample_name, df, items, tag, id_column, expression_column, ignore_ensembl_id_version, missing_expressions_count):
110+
subset = None
111+
if ignore_ensembl_id_version:
112+
items_without_version = [re.sub(r'\.[0-9]+$', '', item) for item in items]
113+
subset = df[df['transcript_without_version'].isin(items_without_version)]
114+
for item in items:
115+
subset.loc[subset['transcript_without_version'] == re.sub(r'\.[0-9]+$', "", item), id_column] = item
116+
else:
117+
subset = df[df[id_column].isin(items)]
118+
expressions = subset[[id_column, expression_column]].groupby(id_column).sum().to_dict()[expression_column]
119+
missing_expressions_count += (len(items) - len(expressions))
121120
if is_multi_sample:
122121
entry.FORMAT += [tag]
123122
entry.call_for_sample[sample_name].data[tag] = to_array(expressions)
124123
else:
125124
entry.add_format(tag, to_array(expressions))
126-
return (entry, missing_expressions_count, entry_count)
125+
return (entry, missing_expressions_count)
127126

128127
def define_parser():
129128
parser = argparse.ArgumentParser(
@@ -216,11 +215,13 @@ def main(args_input = sys.argv[1:]):
216215
if args.mode == 'gene':
217216
genes = list(genes)
218217
if len(genes) > 0:
219-
(entry, missing_expressions_count, entry_count) = add_expressions(entry, is_multi_sample, args.sample_name, df, genes, 'GX', id_column, expression_column, args.ignore_ensembl_id_version, missing_expressions_count, entry_count)
218+
(entry, missing_expressions_count) = add_expressions(entry, is_multi_sample, args.sample_name, df, genes, 'GX', id_column, expression_column, args.ignore_ensembl_id_version, missing_expressions_count)
219+
entry_count += len(genes)
220220
elif args.mode == 'transcript':
221221
transcript_ids = list(transcript_ids)
222222
if len(transcript_ids) > 0:
223-
(entry, missing_expressions_count, entry_count) = add_expressions(entry, is_multi_sample, args.sample_name, df, transcript_ids, 'TX', id_column, expression_column, args.ignore_ensembl_id_version, missing_expressions_count, entry_count)
223+
(entry, missing_expressions_count) = add_expressions(entry, is_multi_sample, args.sample_name, df, transcript_ids, 'TX', id_column, expression_column, args.ignore_ensembl_id_version, missing_expressions_count)
224+
entry_count += len(transcript_ids)
224225
vcf_writer.write_record(entry)
225226

226227
vcf_reader.close()

0 commit comments

Comments
 (0)