# Filtering and reformatting the metadata in R.

# Load tidyverse
library(tidyverse)

# Import the metadata
dat <- read_tsv("infant_metadata.tsv")

# Turn the placeholder strings "not applicable", "not collected" and
# "not provided" into real NA values in every column.
# This uses dplyr only: replace_with_na_all() is not part of the tidyverse,
# so calling it with just library(tidyverse) attached fails.
na_strings <- c("not applicable", "not collected", "not provided")
metadata <- dat %>%
  mutate(across(everything(), ~ replace(.x, .x %in% na_strings, NA)))

# Derive 'mom_and_feed': "indirect" when 'probiotic_mom' is "yes" and 'feed'
# is "breast", otherwise "no". Rows where the test evaluates to NA stay NA.
mom_and_feed_data <- metadata %>%
  mutate(
    mom_and_feed = ifelse(
      probiotic_mom == "yes" & feed == "breast",
      "indirect",
      "no"
    )
  )

# Derive 'probiotic_mode': "direct" when 'probiotic_inf' is "yes"; otherwise
# "indirect" when 'mom_and_feed' is "indirect"; otherwise "no".
probiotic_mode_data <- mom_and_feed_data %>%
  mutate(
    probiotic_mode = ifelse(
      probiotic_inf == "yes",
      "direct",
      ifelse(mom_and_feed == "indirect", "indirect", "no")
    )
  )

# Derive 'probiotic': "yes" for either the "direct" or the "indirect" mode,
# "no" otherwise.
probiotic_data <- probiotic_mode_data %>%
  mutate(
    probiotic = ifelse(
      probiotic_mode == "direct" | probiotic_mode == "indirect",
      "yes",
      "no"
    )
  )

# Change every "Adult" value to NA (in all columns) so that adult samples can
# be dropped by the 'life_stage' filter below.
# NOTE(review): this step originally read from `probiotic_and_eczema_data`,
# which is never defined in this script. `probiotic_data` is the most recent
# table built above; confirm that no eczema-related step is missing.
# dplyr's across() is used because replace_with_na_all() is not part of the
# tidyverse and is therefore unavailable with only library(tidyverse) attached.
na_adult <- c("Adult")
no_adult_data <- probiotic_data %>%
  mutate(across(everything(), ~ replace(.x, .x %in% na_adult, NA)))

# Remove rows whose 'life_stage' is NA (this now includes the former "Adult"
# rows)
remove_adults_data <- no_adult_data %>%
  drop_na(life_stage)

# Remove duplicated entries: keep only the first row for each
# 'anonymized_name', retaining all other columns.
# distinct() raises an error if 'anonymized_name' is missing, whereas base
# subsetting with duplicated() on a missing column would silently yield an
# empty table and an empty output file.
no_duplicates_data <- remove_adults_data %>%
  distinct(anonymized_name, .keep_all = TRUE)

# Export the tsv file
write_tsv(no_duplicates_data, "no_duplicates_metadata.tsv")

# Read the de-duplicated metadata written above and the sample manifest
metadata_unique_infants <- read_tsv("no_duplicates_metadata.tsv")
manifest <- read_tsv("infant_manifest.txt")

# Keep only the manifest rows whose 'sample-id' matches a '#SampleID' that is
# still present in the filtered metadata
manifest_update <- manifest %>%
  semi_join(
    metadata_unique_infants,
    by = c("sample-id" = "#SampleID")
  )

# Export the filtered manifest
manifest_update %>%
  write_tsv("manifest_update.tsv")
