fix(global-standards): Enable multiple precursors for the same reference peptide (#180)

tonywu1999 · web-flow · commit de817e134819 · 2026-01-29T15:58:32.000-05:00
diff --git a/R/utils_normalize.R b/R/utils_normalize.R
@@ -192,32 +192,31 @@ MSstatsNormalize = function(input, normalization_method, peptides_dict = NULL, s
 .normalizeGlobalStandards = function(input, peptides_dict, standards) {
     PeptideSequence = PEPTIDE = PROTEIN = median_by_fraction = NULL
     Standard = FRACTION = LABEL = ABUNDANCE = RUN = GROUP = NULL
-    
-    proteins = as.character(unique(input$PROTEIN))
-    means_by_standard = unique(input[, list(RUN)])
-    for (standard_id in seq_along(standards)) {
-        peptide_name = unlist(peptides_dict[PeptideSequence == standards[standard_id],
-                                            as.character(PEPTIDE)], FALSE, FALSE)
-        if (length(peptide_name) > 0) {
-            standard = input[PEPTIDE == peptide_name, ]
-        } else {
-            if (standards[standard_id] %in% proteins) {
-                standard = input[PROTEIN == standards[standard_id], ]
-            } else {
-                msg = paste("global standard peptides or proteins, ",
-                            standards[standard_id],", is not in dataset.",
-                            "Please check whether 'nameStandards' input is correct or not.")
-                getOption("MSstatsLog")("ERROR", msg)
-                stop(msg)
-            }
-        }
-        mean_by_run = standard[GROUP != "0" & !is.na(ABUNDANCE),
-                               list(mean_abundance = mean(ABUNDANCE, na.rm = TRUE)),
-                               by = "RUN"]
-        colnames(mean_by_run)[2] = paste0("meanStandard", standard_id)
-        means_by_standard = merge(means_by_standard, mean_by_run,
-                                  by = "RUN", all.x = TRUE)
+
+    input_with_peptides <- merge(input, peptides_dict, by = "PEPTIDE", all.x = TRUE)
+    standards_data <- input_with_peptides[
+        (PeptideSequence %in% standards | PROTEIN %in% standards) & 
+            GROUP != "0" & 
+            !is.na(ABUNDANCE)
+    ]
+    missing_standards <- standards[!standards %in% c(standards_data$PeptideSequence, standards_data$PROTEIN)]
+    if (length(missing_standards) > 0) {
+        msg <- paste("Global standard peptides or proteins,",
+                     paste(missing_standards, collapse = ", "),
+                     "are not in dataset. Please check whether 'nameStandards' input is correct.")
+        getOption("MSstatsLog")("ERROR", msg)
+        stop(msg)
     }
+    standards_data[, standard := ifelse(!is.na(PeptideSequence) & PeptideSequence %in% standards,
+                                        PeptideSequence,
+                                        PROTEIN)]
+    means_by_standard <- standards_data[, 
+                                        list(mean_abundance = mean(ABUNDANCE, na.rm = TRUE)),
+                                        by = .(RUN, standard)
+    ]
+    means_by_standard <- dcast(means_by_standard, 
+                               RUN ~ standard, 
+                               value.var = "mean_abundance")
     means_by_standard = data.table::melt(means_by_standard, id.vars = "RUN",
                                          variable.name = "Standard", value.name = "ABUNDANCE")
     means_by_standard[, mean_by_run := mean(ABUNDANCE, na.rm = TRUE), by = "RUN"]
diff --git a/inst/tinytest/test_utils_normalize.R b/inst/tinytest/test_utils_normalize.R
@@ -0,0 +1,111 @@
+# Test for .normalizeGlobalStandards function ----------------------------------
+
+# Setup test data ---------------------------------------------------------------
+
+create_peptide_dictionary <- function() {
+    data.table::data.table(
+        PeptideSequence = c("AAAAAAAAAAAAAAGAGAGAK", "AAAAAAAAAAAAAAGAGAGAK", "AAAAAAAAAAAAVSRD"),
+        PrecursorCharge = c(3, 2, 3),
+        PEPTIDE = c("AAAAAAAAAAAAAAGAGAGAK_3", "AAAAAAAAAAAAAAGAGAGAK_2", "AAAAAAAAAAAAVSRD_3")
+    )
+}
+
+create_test_input <- function(standard_intensities, peptide2_intensities, peptide3_intensities) {
+    # Constants
+    n_proteins <- 3
+    n_runs <- 48
+    n_subjects <- 4
+    n_fractions <- 6
+    
+    # Create base structure
+    input <- data.table::data.table(
+        PROTEIN = rep(c("P1", "P1", "P3"), each = n_runs),
+        PEPTIDE = rep(c("AAAAAAAAAAAAAAGAGAGAK_3", "AAAAAAAAAAAAAAGAGAGAK_2", "AAAAAAAAAAAAVSRD_3"), 
+                      each = n_runs),
+        TRANSITION = rep("NA_NA", n_proteins * n_runs),
+        FEATURE = rep(c("AAAAAAAAAAAAAAGAGAGAK_3_NA_NA", 
+                        "AAAAAAAAAAAAAAGAGAGAK_2_NA_NA", 
+                        "AAAAAAAAAAAAVSRD_3_NA_NA"), 
+                      each = n_runs),
+        LABEL = rep("L", n_proteins * n_runs),
+        GROUP_ORIGINAL = rep(rep(c("Control", "Treatment"), each = n_runs/2), n_proteins),
+        SUBJECT_ORIGINAL = rep(paste0("Subject", rep(1:n_subjects, each = n_fractions)), 
+                               n_proteins * 2),
+        RUN = rep(1:n_runs, n_proteins),
+        GROUP = rep(rep(c("Control", "Treatment"), each = n_runs/2), n_proteins),
+        SUBJECT = rep(paste0("Subject", rep(1:n_subjects, each = n_fractions)), 
+                      n_proteins * 2),
+        FRACTION = rep(rep(1:n_fractions, n_subjects), n_proteins * 2),
+        INTENSITY = c(standard_intensities, peptide2_intensities, peptide3_intensities),
+        ANOMALYSCORES = rep(NA, n_proteins * n_runs),
+        originalRUN = rep(paste0("Run", 1:n_runs), n_proteins)
+    )
+    
+    input[, ABUNDANCE := log2(INTENSITY)]
+    return(input)
+}
+
+# Test 1: Standards with different intensities between groups -------------------
+test_different_group_intensities <- function() {
+    peptide_dict <- create_peptide_dictionary()
+    standards <- c("AAAAAAAAAAAAAAGAGAGAK")
+    
+    # Control group: 262144, Treatment group: 524288
+    standard_intensities <- c(
+        rep(262144, 24),  # Control (runs 1-24)
+        rep(524288, 24)   # Treatment (runs 25-48)
+    )
+    
+    # Non-standard peptides: all 262144
+    peptide3_intensities <- rep(262144, 48)
+    
+    input <- create_test_input(standard_intensities, standard_intensities, peptide3_intensities)
+    output <- MSstats:::.normalizeGlobalStandards(input, peptide_dict, standards)
+    
+    # Verify normalization: Control runs should be shifted up, Treatment runs shifted down
+    control_runs <- 1:24
+    treatment_runs <- 25:48
+    
+    # Check Control group (shifted up to match treatment standard)
+    control_abundance <- output[RUN %in% control_runs & 
+                                    !is.na(ABUNDANCE) & 
+                                    !grepl(standards, PEPTIDE)]$ABUNDANCE
+    expect_true(all(abs(control_abundance - 18.5) < 1e-10),
+                info = "Control group non-standard peptides should be normalized to 18.5")
+    
+    # Check Treatment group (shifted down to match control standard)
+    treatment_abundance <- output[RUN %in% treatment_runs & 
+                                      !is.na(ABUNDANCE) & 
+                                      !grepl(standards, PEPTIDE)]$ABUNDANCE
+    expect_true(all(abs(treatment_abundance - 17.5) < 1e-10),
+                info = "Treatment group non-standard peptides should be normalized to 17.5")
+}
+
+# Test 2: Standards with alternating intensities within fractions ---------------
+test_alternating_intensities_within_fractions <- function() {
+    peptide_dict <- create_peptide_dictionary()
+    standards <- c("AAAAAAAAAAAAAAGAGAGAK")
+    
+    # Standard alternates between 262144 and 524288 within each fraction
+    standard_intensities <- rep(c(262144, 524288), 24)
+    
+    # Non-standard peptides: all 262144
+    peptide3_intensities <- rep(262144, 48)
+    
+    input <- create_test_input(standard_intensities, standard_intensities, peptide3_intensities)
+    output <- MSstats:::.normalizeGlobalStandards(input, peptide_dict, standards)
+    
+    # When standards vary within fractions but average to same level,
+    # no net normalization should occur
+    all_runs <- 1:48
+    normalized_abundance <- output[RUN %in% all_runs & 
+                                       !is.na(ABUNDANCE) & 
+                                       !grepl(standards, PEPTIDE)]$ABUNDANCE
+    
+    expect_true(all(abs(normalized_abundance - 18) < 1e-10),
+                info = "No normalization should occur when standard averages are equal across fractions")
+}
+
+# Run tests ---------------------------------------------------------------------
+test_different_group_intensities()
+test_alternating_intensities_within_fractions()