Skip to content

Commit af4a6bb

Browse files
authored
Merge pull request #25 from waldronlab/devel_ga
Suggested a fix in defining study code in the `createStudyTable` function
2 parents ef75ae5 + 599e6fd commit af4a6bb

2 files changed

Lines changed: 72 additions & 10 deletions

File tree

R/describe_curation.R

Lines changed: 32 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -99,17 +99,39 @@ createTaxonTable <- function(dat, n=10){
9999
#' createStudyTable(full.dat)
100100
#' ## kable_styling(kbl(createStudyTable(full.dat))) #for html styling
101101

102-
createStudyTable <-function(dat){
103-
studies <- data.frame(Study=paste0(str_extract(dat$Authors, "[A-Za-z]+[:space:]"), dat$Year),
104-
Condition=dat$Condition,
105-
Cases=dat$`Group 1 sample size`,
106-
Controls=dat$`Group 0 sample size`,
107-
`Study Design`=dat$`Study design`)
108-
studies %>% group_by(Study) %>% summarize(Condition=first(Condition),
109-
Cases=max(Cases),
110-
Controls=max(Controls),
111-
`Study Design`=first(`Study.Design`))
102+
createStudyTable <- function(bsdb.df, includeAlso = NULL) {
  # Summarise a signature-level data frame into one row per study.
  #
  # Arguments:
  #   bsdb.df      Data frame with one row per signature. The code below reads
  #                the columns `Group 1 sample size`, `Group 0 sample size`,
  #                `Study design`, `Condition`, `PMID`, `DOI` and `URL`, plus
  #                whatever `.make_unique_study_ID()` needs.
  #   includeAlso  Optional character vector of additional column names to
  #                carry into the summary (default NULL = none).
  #
  # Returns the per-study table: `Study code`, the maximum case/control sample
  # sizes, the distinct values of each descriptive column collapsed with "; ",
  # and the number of rows (signatures) per study.
  #
  # Stops with an error if any name in `includeAlso` is not a column of
  # `bsdb.df`.

  # Input check. `%in%` on NULL yields an empty logical vector, so the default
  # `includeAlso = NULL` falls through without needing a separate NULL guard
  # (and without relying on a non-base `is_null()` helper).
  missing_cols <- includeAlso[!(includeAlso %in% colnames(bsdb.df))]
  if (length(missing_cols) > 0) {
    stop(paste(
      "The following columns are not found in the input data frame:",
      paste(missing_cols, collapse = ", ")
    ))
  }

  # Study IDs are generated by `.make_unique_study_ID()` (see simple.R).
  # NB: that helper also rewrites bare DOIs into https://doi.org/ links.
  bsdb_with_StudyCodes.df <- .make_unique_study_ID(bsdb.df)

  # Columns whose distinct values are collapsed into a single string per study.
  collapse_cols <- c(
    "Study design", "Condition", "PMID", "DOI", "URL", includeAlso
  )

  # Every expression inside reframe() yields a single value per group, so the
  # result has exactly one row per study code and is returned ungrouped.
  # Note: max() is called without na.rm, so a missing sample size in any row
  # of a study makes that study's maximum NA.
  bsdb_with_StudyCodes.df %>%
    group_by(`Study code`) %>%
    reframe(
      MaxCases = max(`Group 1 sample size`),
      MaxControls = max(`Group 0 sample size`),
      across(
        all_of(collapse_cols),
        .fns = function(x) paste(unique(x), collapse = "; ")
      ),
      N_signatures = n()
    ) %>%
    relocate(N_signatures, .after = Condition)
}
114136

115137
globalVariables(

R/simple.R

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,3 +74,43 @@ getMostFrequentTaxa <- function(dat, n=10, sig.type=c("both", "increased", "decr
7474
msc.tab <- sort(table(unlist(msc)), decreasing=TRUE)
7575
head(msc.tab, n=n)
7676
}
77+
78+
79+
#' Author: Giacomo Antonello
80+
#' Date: 2025-03-17
81+
#'
82+
#' Description:
83+
#'
84+
#' This function takes a raw bugSigDB input from `bugsigdbr` and generates a
85+
#' unique identifier as curatedMetagenomicsData does: full last name, initial(s) of
86+
#' first name(s) and year of publication. Additionally, it checks if there are
87+
#' more PMID codes associated with the same ID and adds a .1, .2, for each
88+
#' duplication
89+
#'
90+
91+
.make_unique_study_ID <- function(bsdb.df) {
  # Adds a `Study code` column (moved to the front) to `bsdb.df` and rewrites
  # bare DOIs as https://doi.org/ links. The input must provide the columns
  # `DOI`, `Authors list`, `Year`, `PMID` and `URL`, all of which are read
  # below. Returns the modified, ungrouped data frame.
  bsdb.df %>%
    mutate(
      # Fix DOIs: values starting with "10." get the resolver prefix; anything
      # else is kept as is (an NA DOI stays NA).
      DOI = ifelse(
        test = startsWith(DOI, "10."),
        yes = paste0("https://doi.org/", DOI),
        no = DOI
      ),
      # Basic ID: the first entry of the ", "-separated authors list with its
      # spaces removed, followed by "_" and the year. vapply() guarantees a
      # character vector whatever the input looks like.
      BasicID = paste0(
        gsub(
          " ", "",
          vapply(strsplit(`Authors list`, ", "), "[", character(1), 1)
        ),
        "_",
        Year
      )
    ) %>%
    # For each basic ID, search for multiple distinct studies sharing it.
    group_by(BasicID) %>%
    mutate(
      # Rank each distinct PMID/DOI/URL/authors combination within the basic
      # ID. The ordering is arbitrary (factor level order of the pasted key);
      # the point is only to tell overlapping IDs apart.
      uniqueRank = as.numeric(
        as.factor(paste(PMID, DOI, URL, `Authors list`))
      ),
      # The first combination keeps the plain ID; later ones get .1, .2, ...
      `Study code` = ifelse(
        uniqueRank > 1,
        paste(BasicID, uniqueRank - 1, sep = "."),
        BasicID
      )
    ) %>%
    ungroup() %>%
    select(-BasicID, -uniqueRank) %>%
    relocate(`Study code`)
}

0 commit comments

Comments
 (0)