Lab 7: Multiple tests#

We generate gene expression data for 4 genes

data = data.frame(group=c(rep(0,10),rep(1,10)), 
                  gene1 = c(rnorm(10, mean=0.2, sd=1), rnorm(10, mean=2.2, sd=1)), 
                  gene2 = c(rnorm(10, mean=0.2, sd=1), rnorm(10, mean=0.2, sd=1)), 
                  gene3 = c(rnorm(10, mean=0.2, sd=1), rnorm(10, mean=0.2, sd=1)), 
                  gene4 = c(rnorm(10, mean=0.2, sd=1), rnorm(10, mean=2.2, sd=1)))
data
A data.frame: 20 × 5
groupgene1gene2gene3gene4
<dbl><dbl><dbl><dbl><dbl>
0 1.19811555 0.22956702-1.08419662 0.084612503
0-1.62858524 0.40274568 0.60350461-0.004655093
0 1.28312335 0.45731800 1.11287792-0.242972661
0-1.73542813-0.12757043 0.64004036 1.506662639
0-1.52756722-0.55560277-0.76651793-0.814627032
0 1.70326561-0.24244945 1.83036184 1.897556935
0 0.40745837-0.82583041-0.74766860-1.324451805
0 1.01340378 0.58350175 0.66564146 0.563402181
0 0.68077782 0.99151903 1.06470906-0.363952592
0-0.08139574-1.60098568 0.21443631-0.432589686
1 2.04564111-0.43029224 0.06848784 2.692641685
1 1.71044593-1.57980222-0.42685934 2.532929742
1 2.50740658 0.31919092-0.99266925 2.223681929
1 0.82129496-0.57857839 0.79434782 0.016013616
1 2.33681524-0.11799089 0.41078497 2.034572985
1 1.37558868 0.55264759 0.86203948 2.847815435
1 0.48319564 0.07522746-0.42695685 3.503233034
1 2.56424872 0.76410435-1.00304953 1.736595122
1 0.56105148 0.14965757 1.26222945 2.921598810
1 2.33116927-0.53766273 0.71050272 3.026838581

Bonferroni correction#

numtest = 4
pvalue = 1:numtest
for(i in 1:numtest){
    pvalue[i] = t.test(data[,i+1] ~ data[,1])$p.value
}

print("the Bonferroni adjusted pvalues")
pvalue*numtest
[1] "the Bonferroni adjusted pvalues"
  1. 0.0258123799908135
  2. 3.33070040915647
  3. 2.28063580630899
  4. 0.000263727859964977

False Discovery Rate (FDR)#

To perform the FDR analysis in R, we need to install the package fuzzySim by typing install.packages(“fuzzySim”) in R.

library(fuzzySim)
data = data.frame(group=c(rep(0,10),rep(1,10)), 
                  gene1 = c(rnorm(10, mean=0.2, sd=1), rnorm(10, mean=2.2, sd=1)), 
                  gene2 = c(rnorm(10, mean=0.2, sd=1), rnorm(10, mean=0.2, sd=1)), 
                  gene3 = c(rnorm(10, mean=0.2, sd=1), rnorm(10, mean=0.2, sd=1)), 
                  gene4 = c(rnorm(10, mean=0.2, sd=1), rnorm(10, mean=2.2, sd=1)))
data

FDR(data = data, sp.cols = 1, var.cols = 2:5, family = "gaussian")
A data.frame: 20 × 5
groupgene1gene2gene3gene4
<dbl><dbl><dbl><dbl><dbl>
0 1.3976480-0.92167587-1.197666340.1102712
0-1.3098798 1.76548966 2.086927391.7130645
0-2.4764001-0.57216022 1.036605710.2735409
0 2.1952086 0.73057251 0.472891680.9296668
0 0.4096177 2.02222605 0.922592601.7423485
0 0.6431298 0.09217703 0.440542980.8641446
0-1.1396500-1.72277767 0.421225401.1466734
0-0.1282685-1.12118363 0.740019501.1441603
0-0.2723610 0.87682753 0.329200590.8874890
0 0.6993678-1.22104881 1.684276231.8834403
1 2.1478765-0.48465834 0.510772652.4620965
1 3.5315927 1.05095053-1.143777270.5075770
1 2.8735884 1.62357534 0.098421643.4708681
1 3.0041415 0.34208320 0.890664743.7160819
1 2.7695848-0.28623747 0.485371071.1132312
1 2.4831004-0.04640451-0.545161122.1082411
1 0.2734891 0.64293045 0.590376630.1854209
1 1.2071360-0.99812340-0.348361520.9970667
1 1.4916754 0.68135585 0.389594372.0518593
1 2.3509516 0.25828375 1.117103604.7861414
Bivariate p-values adjusted with 'fdr' correction;
3 variable(s) excluded, 1 selected (with q = 0.05)
$exclude
A data.frame: 3 × 5
bivariate.coeffAICBICp.valuep.adjusted
<dbl><dbl><dbl><dbl><dbl>
gene4 0.1837019130.6528432.644310.035820450.07164089
gene3-0.1949730933.0270035.018460.168334960.22444661
gene2 0.0674710934.6425936.634060.552091720.55209172
$select
A data.frame: 1 × 5
bivariate.coeffAICBICp.valuep.adjusted
<dbl><dbl><dbl><dbl><dbl>
gene10.220995121.6147223.606183.353635e-050.0001341454

Microarray gene expression analysis#

To analyze gene expression data, we need to install Bioconductor packages. To install Bioconductor packages, type the following in an R command window:

if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install(c("affy","limma","arrays"))
  1. We first import data from the package arrays.

library(affy)   # Affymetrix pre-processing
library(limma)  # two-color pre-processing; differential
                  # expression
                
## import "phenotype" data, describing the experimental design
phenoData <- read.AnnotatedDataFrame(system.file("extdata", "pData.txt", package="arrays"))
celfiles <- system.file("extdata", package="arrays")
Loading required package: BiocGenerics
Attaching package: ‘BiocGenerics’
The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs
The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min
Loading required package: Biobase
Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.
Warning message:
“package ‘limma’ was built under R version 4.2.2”
Attaching package: ‘limma’
The following object is masked from ‘package:BiocGenerics’:

    plotMA
  1. RMA normalization for microarray expression data

eset <- justRMA(phenoData=phenoData, celfile.path=celfiles)
'getOption("repos")' replaces Bioconductor standard repositories, see
'help("repositories", package = "BiocManager")' for details.
Replacement repositories:
    CRAN: https://cran.r-project.org
'getOption("repos")' replaces Bioconductor standard repositories, see
'help("repositories", package = "BiocManager")' for details.
Replacement repositories:
    CRAN: https://cran.r-project.org
installing the source package ‘hgfocuscdf’
Warning message in install.packages(cdfname, lib = lib, repos = repositories(), :
“installation of package ‘hgfocuscdf’ had non-zero exit status”
Error in getCdfInfo(object): Could not obtain CDF environment, problems encountered:
Specified environment does not contain HG-Focus
Library - package hgfocuscdf not installed
Library - package hgfocuscdf not installed
Traceback:

1. justRMA(phenoData = phenoData, celfile.path = celfiles)
2. just.rma(filenames = l$filenames, phenoData = l$phenoData, description = l$description, 
 .     notes = notes, compress = compress, rm.mask = rm.mask, rm.outliers = rm.outliers, 
 .     rm.extra = rm.extra, verbose = verbose, normalize = normalize, 
 .     background = background, bgversion = bgversion, destructive = destructive, 
 .     cdfname = cdfname)
3. pmindex(tmp)
4. pmindex(tmp)
5. .local(object, ...)
6. indexProbes(object, "pm", genenames = genenames)
7. indexProbes(object, "pm", genenames = genenames)
8. .local(object, which, ...)
9. getCdfInfo(object)
10. stop(paste("Could not obtain CDF environment, problems encountered:", 
  .     paste(unlist(badOut), collapse = "\n"), sep = "\n"))
  1. Identify differentially expressed genes

combn <- factor(paste(pData(phenoData)[,1], pData(phenoData)[,2], sep = "_"))
design <- model.matrix(~combn) # describe model to be fit

fit <- lmFit(eset, design)  # fit each probeset to model
efit <- eBayes(fit)        # empirical Bayes adjustment
topTable(efit, coef=2)      # table of differentially expressed probesets