Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
7cb437e
integrate site-specific weights
trongnhanuit Feb 17, 2026
b8d8103
expand site_pattern according to the site weights
trongnhanuit Feb 17, 2026
de08e6a
Improve validation and show more informative messages
trongnhanuit Feb 19, 2026
79a9bd7
ignore softwipe_build
trongnhanuit Mar 2, 2026
9e3c104
First implementation of site-specific floating weights -> inefficient…
trongnhanuit Mar 2, 2026
532adef
improve the implementation of site-specific floating weights -> only …
trongnhanuit Mar 2, 2026
489f62f
show Total site floating weight
trongnhanuit Mar 2, 2026
61d131d
Add an error "Site-specific weights do not work with partition models…
trongnhanuit Mar 2, 2026
6c6f3f6
Bayesian bootstrap, first implementation
trongnhanuit Mar 10, 2026
aca4060
allow "BaYeS" in both upper and lower case; validate the input format…
trongnhanuit Mar 10, 2026
de378be
don't allow a scaling factor of zero
trongnhanuit Mar 10, 2026
0f2d485
don't allow a zero scaling factor
trongnhanuit Mar 10, 2026
f37b26f
Introduce the "-wbsw" option to output bootstrap site weights, which …
trongnhanuit Mar 25, 2026
ae81805
Merge branch 'master' into site_weights_claude
trongnhanuit Mar 25, 2026
4d10a05
fix a recompilation error
trongnhanuit Mar 25, 2026
66760b1
Fix a compilation error
trongnhanuit Mar 25, 2026
72b2c1e
Introduce the '-czbb' (--polytomy-boot) option to collapse near-zero …
trongnhanuit Mar 26, 2026
d97f6b5
Re-align the console message
trongnhanuit Mar 26, 2026
8dc0856
support writing site weights for normal bootstrap with partition models
trongnhanuit Mar 26, 2026
7e8c7c2
extend Bayesian bootstrap to partition models
trongnhanuit Mar 26, 2026
24eef22
implement UFBoot + Bayesian Bootstrap
trongnhanuit Mar 27, 2026
07a7793
support '-wbsw' for the normal UFBoot
trongnhanuit Mar 27, 2026
1c1deb0
Show a warning of not collapsing near-zero branches if using Bayesian…
trongnhanuit Mar 27, 2026
7dfb5da
automatically set the min_branch_length accordingly to the alignment …
trongnhanuit Apr 24, 2026
38f8154
remove the unused variable pos
trongnhanuit May 21, 2026
5008908
read floating site-weights earlier instead of in the likelihood function
trongnhanuit May 21, 2026
34f2528
add a note that near-zero branches are collapsed in bootstrap trees w…
trongnhanuit May 21, 2026
d6077b5
add the helper function writeWeightsRow() to avoid duplicating codes
trongnhanuit May 21, 2026
38d7e33
remove unused debug message
trongnhanuit May 21, 2026
bcafb28
avoid changing getNSites
trongnhanuit May 21, 2026
51a2368
Merge branch 'master' into site_weights_claude
trongnhanuit May 21, 2026
e1cb751
update Bayesian Bootstrap according to the alignment refactor by Stef…
trongnhanuit May 22, 2026
74aaa7e
remove --polytomy-boot in help
trongnhanuit May 22, 2026
246ebeb
validate the input safer before extracting the scaling factor
trongnhanuit May 22, 2026
401136e
remove unused variable
trongnhanuit May 22, 2026
e036a89
recompute frac_const_sites and frac_invariant_sites according to site…
trongnhanuit May 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,4 @@ vectorclass/
zlib-1.2.7/
/.direnv/
/.envrc
softwipe_build
softwipe_build/
240 changes: 240 additions & 0 deletions alignment/alignment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -809,6 +809,13 @@ Alignment::Alignment(char *filename, char *sequence_type, InputType &intype, str
if (getNSeq() < 3) {
outError("Alignment must have at least 3 sequences");
}

// integrate site-specific weights, if specified
if (Params::getInstance().site_weights_file != "")
integrateSiteSpecWeights();
if (Params::getInstance().site_float_weights_file != "")
readSiteSpecFloatWeights();

double constCountStart = getRealTime();
countConstSites();
if (verbose_mode >= VB_MED) {
Expand Down Expand Up @@ -843,6 +850,177 @@ Alignment::Alignment(char *filename, char *sequence_type, InputType &intype, str

}

void Alignment::readSiteSpecFloatWeights()
{
const string& site_float_weights_file =
Params::getInstance().site_float_weights_file;
ASSERT(site_float_weights_file != "");

std::cout << "Reading site-specific floating weights from file "
<< site_float_weights_file << "..." << std::endl;

std::ifstream in(site_float_weights_file);
if (!in)
{
throw std::runtime_error(
"Failed to open site-specific floating weight file: " + site_float_weights_file);
}

// pre-allocate the memory
DoubleVector site_float_weights;
// update aln->ori_num_sites, if it's not been set
if (!(ori_num_sites > 0))
{
ori_num_sites = getNSite();
}
site_float_weights.reserve(ori_num_sites);

std::string token;
while (in >> token)
{
double weight;
try
{
weight = convert_double(token.c_str());
}
catch (...)
{
throw std::runtime_error(
"Invalid site weight '" + token +
"': weights must be positive floating-point numbers");
}

if (!(weight > 0.0))
{
throw std::runtime_error(
"Site floating weights must be positive (found " + token + ")");
}

site_float_weights.push_back(weight);
}

// Check the number of weights
if (site_float_weights.size() != ori_num_sites)
{
throw std::runtime_error(
"The number of floating weights ("
+ convertIntToString(site_float_weights.size())
+ ") differs from the number of sites ("
+ convertIntToString(ori_num_sites) + ")!");
}

// initialize all ptn_freq at 0
pattern_weight.resize(getNPattern(), 0.0);

// integrate the site-specific floating weights into the pattern frequencies
double total_site_float_weight = 0.0;
for (size_t i = 0; i < site_float_weights.size(); ++i)
{
pattern_weight[site_pattern[i]] += site_float_weights[i];

// update total_site_float_weight
total_site_float_weight += site_float_weights[i];
}

// Show info
std::cout << "Total site floating weight: " << total_site_float_weight << std::endl;
}

void Alignment::integrateSiteSpecWeights()
{
const string& site_weights_file = Params::getInstance().site_weights_file;
ASSERT(site_weights_file != "");

std::cout << "Reading site-specific weights from file " << site_weights_file << "..." << std::endl;

// read site-specific weights from file
std::ifstream in(site_weights_file);
if (!in)
{
throw std::runtime_error("Failed to open site-specific weight file: " + site_weights_file);
}

// pre-allocate the memory
std::vector<int> site_weights;
site_weights.reserve(getNSite());

// read the weights
std::string token;
int total_weight = 0;
while (in >> token)
{
// Check that the token consists of digits only
if (token.empty() ||
!std::all_of(token.begin(), token.end(),
[](unsigned char c) { return std::isdigit(c); }))
{
throw std::runtime_error(
"Invalid site weight '" + token +
"': weights must be positive integers");
}

// validate the weights
int weight = convert_int(token.c_str());
if (weight <= 0)
{
throw std::runtime_error(
"Site weights must be positive integers (found " + token + ")");
}
site_weights.push_back(weight);
total_weight += weight;
}

// Check the number of weights
if (site_weights.size() != getNSite())
{
throw std::runtime_error("The number of weights ("
+ convertIntToString(site_weights.size())
+ ") differs from the number of sites ("
+ convertIntToString(getNSite()) + ")!");
}

// Show info
std::cout << "Total site weight: " << total_weight << std::endl;

// expand the alignment (adding columns according to the site weights)
size_t expanding_site_id = site_pattern.size();
// record the original number of sites before expanding the site_pattern
ori_num_sites = expanding_site_id;
site_pattern.resize(total_weight);

// integrate the site-specific weights into the pattern frequencies
for (size_t i = 0; i < site_weights.size(); ++i)
{
// extract the pattern id of the current site
const int site_pattern_id = site_pattern[i];

// update the freq of the pattern according to the site weight
at(site_pattern_id).frequency += (site_weights[i] - 1);

// set the pattern id to expanded sites
for (int j = 0; j < site_weights[i] - 1; ++j)
{
site_pattern[expanding_site_id++] = site_pattern_id;
}
}
}

void Alignment::createBayesBootWeights(DoubleVector &site_weights, size_t nsite, int *rstream)
{
site_weights.resize(nsite);
double sum = 0.0;
// Draw n independent Exponential(1) samples — this gives a
// Dirichlet(1,...,1) distribution after normalization (Rubin 1981).
for (size_t i = 0; i < nsite; ++i) {
site_weights[i] = random_double_exponential_distribution(1.0, rstream);
sum += site_weights[i];
}
// Normalize so weights sum to nsite (expected weight per site = 1.0)
const double scale = (double)nsite / sum;
for (size_t i = 0; i < nsite; ++i)
site_weights[i] *= scale;
}

Alignment::Alignment(NxsDataBlock *data_block, char *sequence_type, string model)
: Alignment() {
this->model_name = model;
Expand All @@ -853,7 +1031,15 @@ Alignment::Alignment(NxsDataBlock *data_block, char *sequence_type, string model
if (getNSeq() < 3) {
outError("Alignment must have at least 3 sequences");
}

// integrate site-specific weights, if specified
if (Params::getInstance().site_weights_file != "")
integrateSiteSpecWeights();
if (Params::getInstance().site_float_weights_file != "")
readSiteSpecFloatWeights();

countConstSites();

if (Params::getInstance().compute_seq_composition) {
cout << "Alignment has " << getNSeq() << " sequences with " << getNSite()
<< " columns, " << getNPattern() << " distinct patterns" << endl
Expand Down Expand Up @@ -891,12 +1077,20 @@ Alignment::Alignment(StrVector& names, StrVector& seqs, char *sequence_type, str
}
double readStart = getRealTime();
readStrVec(names, seqs, sequence_type);

if (verbose_mode >= VB_MED) {
cout << "Time to read input file was " << (getRealTime() - readStart) << " sec." << endl;
}
if (getNSeq() < 3) {
outError("Alignment must have at least 3 sequences");
}

// integrate site-specific weights, if specified
if (Params::getInstance().site_weights_file != "")
integrateSiteSpecWeights();
if (Params::getInstance().site_float_weights_file != "")
readSiteSpecFloatWeights();

double constCountStart = getRealTime();
countConstSites();
if (verbose_mode >= VB_MED) {
Expand Down Expand Up @@ -4056,6 +4250,7 @@ void Alignment::createBootstrapAlignment(Alignment *aln, IntVector* pattern_freq
memcpy(non_stop_codon, aln->non_stop_codon, strlen(genetic_code));
}
STATE_UNKNOWN = aln->STATE_UNKNOWN;

site_pattern.clear();
pattern_index.clear();
clear();
Expand All @@ -4074,6 +4269,7 @@ void Alignment::createBootstrapAlignment(Alignment *aln, IntVector* pattern_freq
// resampling also the per-site state frequency vector
outError("Unsupported bootstrap feature, pls contact the developers");
}

if (Params::getInstance().jackknife_prop > 0.0 && spec) {
outError((string)"Unsupported jackknife with sampling " + spec);
}
Expand All @@ -4083,6 +4279,7 @@ void Alignment::createBootstrapAlignment(Alignment *aln, IntVector* pattern_freq
size_t nsite = aln->getNSite();
IntVector sample;
random_resampling(nsite, sample);
boot_site_weights.assign(sample.begin(), sample.end());
for (size_t site = 0; site < nsite; ++site) {
for (int rep = 0; rep < sample[site]; ++rep) {
int ptn = aln->getPatternID(site);
Expand All @@ -4103,6 +4300,29 @@ void Alignment::createBootstrapAlignment(Alignment *aln, IntVector* pattern_freq
}
}
}
} else if (strncasecmp(spec, "BAYES", 5) == 0) {
size_t nsite = aln->getNSite();
// Bayesian bootstrap (Rubin 1981): Dirichlet(1,...,1) site weights.
// Float weights → ML via pattern_weight[]; scaled integer frequencies → parsimony.
int pars_scale = (spec[5] == ':' && isdigit((unsigned char)spec[6]))
? max(1, convert_int(spec + 6)) : 10;
const size_t nptn = aln->getNPattern();
DoubleVector site_weights;
createBayesBootWeights(site_weights, nsite);
boot_site_weights = site_weights;
// Accumulate per-pattern float weights
vector<double> ptn_float_weight(nptn, 0.0);
for (size_t i = 0; i < nsite; ++i)
ptn_float_weight[aln->getPatternID(i)] += site_weights[i];
// Per-pattern integer frequencies for parsimony (min 1 so no site is silenced).
// addPattern() fills site_pattern with pat.frequency copies of the pattern index.
for (size_t ptn = 0; ptn < nptn; ++ptn) {
Pattern pat = aln->at(ptn);
pat.frequency = max(1, (int)round(ptn_float_weight[ptn] * pars_scale));
addPattern(pat);
}
// Float weights for ML (picked up by PhyloTree::computePtnFreq)
pattern_weight.assign(ptn_float_weight.begin(), ptn_float_weight.end());
} else if (strncmp(spec, "GENESITE,", 9) == 0) {
// resampling genes, then resampling sites within resampled genes
convert_int_vec(spec+9, site_vec);
Expand Down Expand Up @@ -4184,7 +4404,27 @@ void Alignment::createBootstrapAlignment(Alignment *aln, IntVector* pattern_freq
if (aln->isSSF()) {
ASSERT(ptn_state_freq.size() == getNPattern());
}

// NHANLT: this line is removed by StefanFlauberg
// verbose_mode = save_mode;
countConstSites();
// For Bayesian bootstrap, countConstSites() divides by getNSite() which equals the
// sum of pars_scale-scaled integer frequencies (floored to 1), not the original site
// count. Recompute the fraction fields from pattern_weight[] (true Dirichlet floats
// that sum to nsite) so downstream rate optimisation (e.g. pinvar) stays correct.
// num_informative_sites / num_variant_sites from countConstSites() are kept as-is
// because parsimony uses the integer frequencies.
if (spec && strncasecmp(spec, "BAYES", 5) == 0 && !pattern_weight.empty()) {
double const_weight = 0.0, invariant_weight = 0.0;
const double total_weight = aln->getNSite(); // pattern_weight sums to nsite
for (size_t ptn = 0; ptn < pattern_weight.size(); ++ptn) {
if (at(ptn).isConst()) const_weight += pattern_weight[ptn];
if (at(ptn).isInvariant()) invariant_weight += pattern_weight[ptn];
}
frac_const_sites = const_weight / total_weight;
frac_invariant_sites = invariant_weight / total_weight;
}
// buildSeqStates();
}

void Alignment::createBootstrapAlignment(IntVector &pattern_freq, const char *spec) {
Expand Down
44 changes: 43 additions & 1 deletion alignment/alignment.h
Original file line number Diff line number Diff line change
Expand Up @@ -928,7 +928,12 @@ class Alignment : public vector<Pattern>, public CharSet, public StateSpace {
* Virtual population size for PoMo model
*/
int virtual_pop_size;


/**
* the original number of site before expanding alignment with site-specific weights
*/
int ori_num_sites = -1;

// TODO DS: Maybe change default to SAMPLING_WEIGHTED_HYPER.
/// The sampling method (defaults to SAMPLING_WEIGHTED_BINOM).
SamplingType pomo_sampling_method;
Expand All @@ -943,6 +948,24 @@ class Alignment : public vector<Pattern>, public CharSet, public StateSpace {
IntIntMap pomo_sampled_states_index; // indexing, to quickly find if a PoMo-2-state is already present

/* for site-specific state frequency model with Huaichun, Edward, Andrew */

/* site to model ID map */
IntVector site_model;

/** site to state frequency vector */
vector<double*> site_state_freq;

/** pattern weights
i.e., the pattern freqs that integrate site-specific floating weights
*/
DoubleVector pattern_weight;

/**
* Per-site resample weights populated by createBootstrapAlignment().
* Standard bootstrap: integer counts (as doubles); Bayesian bootstrap: Dirichlet floats.
* Length = getNSite() of the source alignment.
*/
DoubleVector boot_site_weights;

/** pattern index to state frequency vector map */
vector<double*> ptn_state_freq;
Expand Down Expand Up @@ -1063,6 +1086,20 @@ class Alignment : public vector<Pattern>, public CharSet, public StateSpace {
* @return id the genetic code id, or 0 if not a codon type
*/
int getGeneticCodeId();

/**
Read site-specific floating weights
*/
void readSiteSpecFloatWeights();

/**
* Draw per-site Dirichlet(1,...,1) weights for Bayesian bootstrap.
* Samples getNSite() Exponential(1) values and normalizes so they sum
* to getNSite() (expected weight per site = 1.0).
* @param site_weights [OUT] per-site weights, length = getNSite()
* @param rstream optional random stream (NULL = global randstream)
*/
void createBayesBootWeights(DoubleVector &site_weights, size_t nsite, int *rstream = NULL);

protected:
/**
Expand Down Expand Up @@ -1108,6 +1145,11 @@ class Alignment : public vector<Pattern>, public CharSet, public StateSpace {
Output a mutation into Maple file
*/
void outputMutation(std::ofstream &out, char state_char, int32_t pos, int32_t length = -1);

/**
Integrate site-specific weights
*/
void integrateSiteSpecWeights();
};


Expand Down
Loading
Loading