Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 18 additions & 3 deletions src/metacoag/metacoag_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,9 +322,14 @@ def run(args):
# Map original contig identifiers to contig identifiers of MEGAHIT assembly graph
graph_to_contig_map = BidirectionalMap()

for (n, m), (n2, m2) in zip(graph_contigs.items(), original_contigs.items()):
if m == m2:
graph_to_contig_map[n] = n2
# Build reverse lookup: sequence -> original contig name
original_seq_to_name = {}
for name, seq in original_contigs.items():
original_seq_to_name[seq] = name

for graph_name, graph_seq in graph_contigs.items():
if graph_seq in original_seq_to_name:
graph_to_contig_map[graph_name] = original_seq_to_name[graph_seq]

graph_to_contig_map_rev = graph_to_contig_map.inverse

Expand Down Expand Up @@ -352,6 +357,12 @@ def run(args):
abundance_file=abundance_file,
)

# Assign length 0 to graph-only contigs (in GFA but not in FASTA)
# so they are excluded by all downstream min_length checks
for i in range(node_count):
if i not in contig_lengths:
contig_lengths[i] = 0

else:
sequences, coverages, contig_lengths, n_samples = feature_utils.get_cov_len(
contigs_file=contigs_file,
Expand Down Expand Up @@ -652,6 +663,7 @@ def run(args):
normalized_tetramer_profiles=normalized_tetramer_profiles,
coverages=coverages,
w_intra=w_intra,
nthreads=nthreads,
)

# Get remaining contigs with single-copy marker genes which are not assigned to bins
Expand Down Expand Up @@ -737,6 +749,7 @@ def run(args):
coverages=coverages,
depth=1,
weight=w_intra,
nthreads=nthreads,
)

logger.debug(f"Total number of binned contigs: {len(bin_of_contig)}")
Expand Down Expand Up @@ -765,6 +778,7 @@ def run(args):
coverages=coverages,
depth=depth,
weight=w_inter,
nthreads=nthreads,
)

logger.debug(f"Total number of binned contigs: {len(bin_of_contig)}")
Expand Down Expand Up @@ -891,6 +905,7 @@ def thread_function(
coverages=coverages,
depth=depth,
weight=MAX_WEIGHT,
nthreads=nthreads,
)

logger.debug(f"Total number of binned contigs: {len(bin_of_contig)}")
Expand Down
29 changes: 19 additions & 10 deletions src/metacoag/metacoag_utils/feature_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,19 +98,24 @@ def get_tetramer_profiles(
else:
kmer_inds_4, kmer_count_len_4 = compute_kmer_inds(4)

# Handle both list and dict sequences
if isinstance(sequences, dict):
seq_keys = list(sequences.keys())
seq_values = [sequences[k] for k in seq_keys]
else:
seq_keys = list(range(len(sequences)))
seq_values = sequences

pool = Pool(nthreads)
record_tetramers = pool.map(
count_kmers, [(seq, 4, kmer_inds_4, kmer_count_len_4) for seq in sequences]
count_kmers, [(seq, 4, kmer_inds_4, kmer_count_len_4) for seq in seq_values]
)
pool.close()

normalized = [x[1] for x in record_tetramers]

i = 0

for l in range(len(normalized)):
normalized_tetramer_profiles[i] = normalized[l]
i += 1
for idx, key in enumerate(seq_keys):
normalized_tetramer_profiles[key] = normalized[idx]

with open(
f"{output_path}{contigs_file}.normalized_contig_tetramers.pickle", "wb"
Expand All @@ -121,8 +126,8 @@ def get_tetramer_profiles(

tetramer_profiles = {}

for i in range(len(normalized_tetramer_profiles)):
if contig_lengths[i] >= min_length:
for i in normalized_tetramer_profiles:
if i in contig_lengths and contig_lengths[i] >= min_length:
tetramer_profiles[i] = normalized_tetramer_profiles[i]

return tetramer_profiles
Expand Down Expand Up @@ -186,19 +191,23 @@ def get_cov_len_megahit(

i = 0

sequences = []
sequences = {}

for index, record in enumerate(SeqIO.parse(contigs_file, "fasta")):
if record.id not in graph_to_contig_map_rev:
continue
contig_num = contig_names_rev[graph_to_contig_map_rev[record.id]]
length = len(record.seq)
contig_lengths[contig_num] = length
sequences.append(str(record.seq))
sequences[contig_num] = str(record.seq)
i += 1

with open(abundance_file, "r") as my_abundance:
for line in my_abundance:
strings = line.strip().split("\t")

if strings[0] not in graph_to_contig_map_rev:
continue
contig_num = contig_names_rev[graph_to_contig_map_rev[strings[0]]]

if contig_lengths[contig_num] >= min_length:
Expand Down
Loading