1) PACKAGES

The main packages used for the analysis are:

# CODE: LOAD MAIN R LIBRARIES

library(MALDIquant)
library(MALDIquantForeign)
library(MALDIrppa)
library(cluster)
library(scales)
library(ggplot2)
library(ggrepel)
library(dendextend)
library(mixOmics)
library(lsa)
library(knitr)
library(kableExtra)

2) TASKS AND PARAMETERS

Briefly, the main tasks are reported in the first column (TASK NAME) and the second column (CHOICE) shows if the tasks are selected or not by the user. The third column (METHOD I) and the fourth column (METHOD II) show the methods for the related task. Numeric parameters are collected from the fifth column to the seventh column (PARAMETER I,II,III). Following, the list of tasks, methods and parameters set by the user are:

Task Choice Method I Method II Parameter I Parameter II Parameter III
quality_pre yes
trimming yes 900 2500
quality_post yes
stabilization yes sqrt
smoothing yes SavitzkyGolay 10
baseline yes SNIP 25 75
normalization yes TIC
average yes mean
alignment yes MAD lowess 20 2.0 0.002
peak yes strict 0.2
pca yes
clustering yes average gap 3
heatmap yes 1
reporting yes yes

3) MASS SPECTRA ACQUISITION

The MALDI-TOF mass spectra uploaded by the user are 40:

# CODE: A) IMPORT MASS SPECTRA, B) LIST MASS SPECTRA FILES

# A)
spectra = MALDIquant::import(path_msd,type = "auto",removeEmptySpectra = TRUE,verbose = TRUE)

# B)
files = list.files(files_path)
# CODE: SHOW MASS SPECTRA FILES

files
##  [1] "13S13569_A_SP.txt" "13S13569_B_SP.txt" "13S13569_C_SP.txt"
##  [4] "13S13569_D_SP.txt" "13S14986_A_SP.txt" "13S14986_B_SP.txt"
##  [7] "13S14986_C_SP.txt" "13S14986_D_SP.txt" "13S15816_A_SP.txt"
## [10] "13S15816_B_SP.txt" "13S15816_C_SP.txt" "13S15816_D_SP.txt"
## [13] "13S15947_A_SP.txt" "13S15947_B_SP.txt" "13S15947_C_SP.txt"
## [16] "13S15947_D_SP.txt" "C103_A_SP.txt"     "C103_B_SP.txt"    
## [19] "C103_C_SP.txt"     "C103_D_SP.txt"     "C128_A_SP.txt"    
## [22] "C128_B_SP.txt"     "C128_C_SP.txt"     "C128_D_SP.txt"    
## [25] "C131_A_SP.txt"     "C131_B_SP.txt"     "C131_C_SP.txt"    
## [28] "C131_D_SP.txt"     "C132_A_SP.txt"     "C132_B_SP.txt"    
## [31] "C132_C_SP.txt"     "C132_D_SP.txt"     "C133_A_SP.txt"    
## [34] "C133_B_SP.txt"     "C133_C_SP.txt"     "C133_D_SP.txt"    
## [37] "C135_A_SP.txt"     "C135_B_SP.txt"     "C135_C_SP.txt"    
## [40] "C135_D_SP.txt"

All the mass spectra have been converted in the S4 class type ‘MassSpectrum’, with the information for each mass spectrum about:

The information of the first mass spectrum is reported for showing the ’MassSpectrum" class type:

# CODE: A) EXTRACT NAME AND EXTENSION FILES, B) APPLY METADATA TO MASS-SPECTRUM CLASS

# A)
target = read.table("targetfile.txt")
id = tools::file_path_sans_ext(files)
ext = tools::file_ext(files)

# B)        
# Create metadata
for(k in 1:(nrow(target))){
  MALDIquant::metaData(spectra[[k]])$sample = target[k,2]
  MALDIquant::metaData(spectra[[k]])$run    = target[k,3]
  MALDIquant::metaData(spectra[[k]])$group  = target[k,4]
  MALDIquant::metaData(spectra[[k]])$ext    = ext[k]
}
# CODE: A) SHOW MASSSPECTRUM CLASS, B) SHOW APPLIED METADATA

# A)
metaData(spectra[[1]])$file = NULL
spectra[[1]]
## S4 class type            : MassSpectrum     
## Number of m/z values     : 41376            
## Range of m/z values      : 100.09 - 3985.992
## Range of intensity values: 0e+00 - 2.988e+03
## Memory usage             : 649.766 KiB
# B)
metaData(spectra[[1]])
## $sample
## [1] 13S13569
## 10 Levels: 13S13569 13S14986 13S15816 13S15947 C103 C128 C131 ... C135
## 
## $run
## [1] 1
## 
## $group
## [1] N
## Levels: D N
## 
## $ext
## [1] "txt"

4) QUALITY CONTROL

The quality control on the uploaded mass spectra can be performed before and after the trimming task, in order to control if the mass spectra trimming affects the number of mass spectra that can be outliers for the dataset.

4.1) QUALITY CONTROL PRE-TRIMMING

The numbers of m/z values is shown per mass spectrum. Moreover, a control on empty mass spectra and on frequency of m/z values per mass spectrum (over a certain threshold) is reported. We suggest to remove empty mass spectra from the analysis; irregularities in frequency do not affect the following analysis.

## 
## 37895 41145 41376 41935 42246 42694 43887 45135 46132 46237 46352 46544 
##     1     1     1     1     1     1     1     1     1     1     1     1 
## 46617 48881 49038 49263 49503 49745 49858 50397 50485 50735 50881 51259 
##     1     1     1     1     1     1     1     1     1     1     1     4 
## 51458 51529 51578 52918 53484 53580 53674 53785 54902 56302 56668 56829 
##     1     1     1     1     1     1     1     1     1     1     1     1 
## 60424 
##     1

Any empty mass spectrum? FALSE

Are all the mass spectra regular? FALSE

#CODE: LOG CONTROL FILE CREATION

# Simple check
len = sapply(spectra,length)
empty = sapply(spectra,MALDIquant::isEmpty)
regular = sapply(spectra,MALDIquant::isRegular)
    
# Writing control log file
file_con = file("log_control.txt","w")
writeLines("I) PARAMETERS:", file_con, sep = "\n")
for(k in 1:nrow(attr_file)){
  writeLines(apply(attr_file[k,],1,paste,collapse = " "),file_con)
}
writeLines("\n\nII) UPLOADED FILES:", file_con, sep = "\n")
writeLines(files, file_con)
writeLines("\n\nIII) SAMPLES:", file_con, sep = "\n")
writeLines(as.character(target[,2]), file_con)
writeLines("\n\nIV) REPLICATES:", file_con, sep = "\n")
writeLines(as.character(target[,3]), file_con)
writeLines("\n\nV) NUMBER OF M/Z VALUES:", file_con, sep = "\n")
writeLines(as.character(len), file_con)
writeLines("\n\nVI) RANGE OF M/Z VALUES:", file_con, sep = "\n")
for(kk in 1:length(spectra)){
  writeLines(paste(as.character(format(round(min(MALDIquant::mass(spectra[[kk]])), 5), nsmall = 5)),"/", 
                   as.character(format(round(max(MALDIquant::mass(spectra[[kk]])), 5), nsmall = 5)) ), file_con)
}
writeLines("\n\nVII) INTENSITY OF M/Z VALUES:", file_con, sep = "\n")
for(kkk in 1:length(spectra)){
  writeLines(paste(as.character(format(round(min(MALDIqaunt::intensity(spectra[[kkk]])), 5), nsmall = 5)),"/", 
                   as.character(format(round(max(MALDIquant::intensity(spectra[[kkk]])), 5), nsmall = 5)) ),file_con)
}
writeLines("\n\nVIII) EMPTY/IRREGULAR MASS SPECTRA: ", file_con, sep = "\n")
writeLines(paste(as.character(empty),"/",as.character(regular)), file_con, sep = "\n")
close(file_con)
# CODE: A) PERFORM QUALITY CONTROL PRE-TRIMMING, B) PLOT QUALITY CONTROL PRE-TRIMMING ATYPICALITY SCORE,
#       C) PLOT RAW MASS SPECTRA

# A)
# Quality control pre-trimming
results = MALDIrppa::screenSpectra(spectra)
sp_code = id
df = data.frame(sp_code = sp_code, x = 1:nrow(results$est.table), y = results$est.table$`A score`)
df$hidden_code = ifelse(df$y > results$lower & df$y < results$upper, "", as.character(df$sp_code))

# B)        
ggplot(df, aes(df$x,df$y)) +
  geom_point(color = ifelse(df$y > results$lower & df$y < results$upper, "darkgreen","darkred")) +
  geom_label_repel(color = ifelse(df$y > results$lower & df$y < results$upper, "darkgreen","darkred"), 
                   aes(label = hidden_code)) +
  geom_hline(yintercept = c(results$upper, results$lower), color = "darkred", linetype = 2) +
  theme_bw() +
  labs(x = "MASS SPECTRUM INDEX", y = "A SCORE", title = "Mass Spectra Screening (Pre Trimming)",
       subtitle = paste0("Number of potentially faulty spectra: ",results$cfailure, " (", results$prop*100, " %)"))
ggsave("qcontrol_pre-trim.png", width = 16.7, height = 12.5, dpi = 300)

# C)        
# Plot all raw mass spectra (in 'Raw' folder)
dir.create("Raw")
for (r in 1:length(spectra)){
  png(paste0("Raw/raw_",id[r],".png"),units = "px",width = 1600,height = 1200,res = 300)
  par(mar = c(4.3,4.1,4.1,2.1))
  plot(spectra[[r]], col = "dark blue")
  title(main = list(paste(id[r],"(raw)",sep = " "),col = "dark red"))
  dev.off()
}

The atypicality score A, calculated by using the Rousseeuw’s Q robust scale estimator, is used in order to provide a range of acceptance for the mass spectra (between the dotted red lines). Only the outliers are depicted with the codename of the mass spectrum.Fig.1

4.2) QUALITY CONTROL POST-TRIMMING

The trimming task is considered done even if the user does not provide a specific range of trimming (the mass spectra are considered in their entirety.) Nevertheless, the quality control after the trimming is still subject to the user’s choice. In this case the mass spectra are trimmed.

# CODE: A) PERFORM TRIMMING, B) PLOT TRIMMED MASS SPECTRA

# A)
if (attr_file[2,2]=="yes"){
  if (attr_file[2,5]=="0" & attr_file[2,6]=="0"){
                
    # M/z max and min for each mass spectrum
    len = length(spectra)
    max_all = min_all = rep(0,len)
    for (k in 1:len){
      max_all[k] = max(MALDIquant::mass(spectra[[k]]))
      min_all[k] = min(MALDiquant::mass(spectra[[k]]))
    }
                
    # Eliminating null values
    beg = min(min_all[min_all!=0])
    end = max(max_all[max_all!=0])
        
  } else {
    beg = as.numeric(attr_file[2,5])
    end = as.numeric(attr_file[2,6])
  }
            
  spectra_trim = MALDIquant::trim(spectra,c(beg,end))
        
# B)
  # Plot all trimmed mass spectra (in 'Trim' folder)
  dir.create("Trim")
  for (r in 1:length(spectra_trim)){
    png(paste0("Trim/trim_",id[r],".png"),units = "px",width = 1600,height = 1200,res = 300)
    par(mar = c(4.3,4.1,4.1,2.1))
    plot(spectra_trim[[r]], col = "dark blue")
    title(main = list(paste(id[r],"(trimmed)",sep = " "),col = "dark red"))
    dev.off()
  }
        
} else {
        
  spectra_trim = spectra
  
}
# CODE: A) PERFORM QUALITY CONTROL POST-TRIMMING, B) PLOT QUALITY CONTROL POST-TRIMMING ATYPICALITY SCORE

# A)
# Quality control post-trimming
results = screenSpectra(spectra_trim)
sp_code = id
df = data.frame(sp_code = sp_code, x = 1:nrow(results$est.table), y = results$est.table$`A score`)
df$hidden_code = ifelse(df$y > results$lower & df$y < results$upper, "", as.character(df$sp_code))

# B)        
ggplot(df, aes(df$x,df$y)) +
  geom_point(color = ifelse(df$y > results$lower & df$y < results$upper, "darkgreen","darkred")) +
  geom_label_repel(color = ifelse(df$y > results$lower & df$y < results$upper, "darkgreen","darkred"), 
                   aes(label = hidden_code)) +
  geom_hline(yintercept = c(results$upper, results$lower), color = "darkred", linetype = 2) +
  theme_bw() +
  labs(x = "MASS SPECTRUM INDEX", y = "A SCORE", title = "Mass Spectra Screening (Post Trimming)",
       subtitle = paste0("Number of potentially faulty spectra: ", results$cfailure, " (", results$prop*100, " %)"))
ggsave("qcontrol_post-trim.png", width = 16.7, height = 12.5, dpi = 300)

The atypicality score A is calculated after the trimming task. We suggest to compare the possible outliers between the two cases (before and after trimming), in order to decide if the related mass spectra should be eliminated from the study or not.