Lab 7: Multiple tests

Contents

Lab 7: Multiple tests#

We generate gene expression data for 4 genes

data = data.frame(group=c(rep(0,10),rep(1,10)), 
                  gene1 = c(rnorm(10, mean=0.2, sd=1), rnorm(10, mean=2.2, sd=1)), 
                  gene2 = c(rnorm(10, mean=0.2, sd=1), rnorm(10, mean=0.2, sd=1)), 
                  gene3 = c(rnorm(10, mean=0.2, sd=1), rnorm(10, mean=0.2, sd=1)), 
                  gene4 = c(rnorm(10, mean=0.2, sd=1), rnorm(10, mean=2.2, sd=1)))
data

A data.frame: 20 × 5
group	gene1	gene2	gene3	gene4
<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
0	1.19811555	0.22956702	-1.08419662	0.084612503
0	-1.62858524	0.40274568	0.60350461	-0.004655093
0	1.28312335	0.45731800	1.11287792	-0.242972661
0	-1.73542813	-0.12757043	0.64004036	1.506662639
0	-1.52756722	-0.55560277	-0.76651793	-0.814627032
0	1.70326561	-0.24244945	1.83036184	1.897556935
0	0.40745837	-0.82583041	-0.74766860	-1.324451805
0	1.01340378	0.58350175	0.66564146	0.563402181
0	0.68077782	0.99151903	1.06470906	-0.363952592
0	-0.08139574	-1.60098568	0.21443631	-0.432589686
1	2.04564111	-0.43029224	0.06848784	2.692641685
1	1.71044593	-1.57980222	-0.42685934	2.532929742
1	2.50740658	0.31919092	-0.99266925	2.223681929
1	0.82129496	-0.57857839	0.79434782	0.016013616
1	2.33681524	-0.11799089	0.41078497	2.034572985
1	1.37558868	0.55264759	0.86203948	2.847815435
1	0.48319564	0.07522746	-0.42695685	3.503233034
1	2.56424872	0.76410435	-1.00304953	1.736595122
1	0.56105148	0.14965757	1.26222945	2.921598810
1	2.33116927	-0.53766273	0.71050272	3.026838581

Bonferroni correction#

numtest = 4
pvalue = 1:numtest
for(i in 1:numtest){
    pvalue[i] = t.test(data[,i+1] ~ data[,1])$p.value
}

print("the Bonferroni adjusted pvalues")
pvalue*numtest

[1] "the Bonferroni adjusted pvalues"

0.0258123799908135
3.33070040915647
2.28063580630899
0.000263727859964977

False Discovery Rate (FDR)#

To perform the FDR analysis in R, we need to install the package fuzzySim by typing install.packages(“fuzzySim”) in R.

library(fuzzySim)
data = data.frame(group=c(rep(0,10),rep(1,10)), 
                  gene1 = c(rnorm(10, mean=0.2, sd=1), rnorm(10, mean=2.2, sd=1)), 
                  gene2 = c(rnorm(10, mean=0.2, sd=1), rnorm(10, mean=0.2, sd=1)), 
                  gene3 = c(rnorm(10, mean=0.2, sd=1), rnorm(10, mean=0.2, sd=1)), 
                  gene4 = c(rnorm(10, mean=0.2, sd=1), rnorm(10, mean=2.2, sd=1)))
data

FDR(data = data, sp.cols = 1, var.cols = 2:5, family = "gaussian")

A data.frame: 20 × 5
group	gene1	gene2	gene3	gene4
<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
0	1.3976480	-0.92167587	-1.19766634	0.1102712
0	-1.3098798	1.76548966	2.08692739	1.7130645
0	-2.4764001	-0.57216022	1.03660571	0.2735409
0	2.1952086	0.73057251	0.47289168	0.9296668
0	0.4096177	2.02222605	0.92259260	1.7423485
0	0.6431298	0.09217703	0.44054298	0.8641446
0	-1.1396500	-1.72277767	0.42122540	1.1466734
0	-0.1282685	-1.12118363	0.74001950	1.1441603
0	-0.2723610	0.87682753	0.32920059	0.8874890
0	0.6993678	-1.22104881	1.68427623	1.8834403
1	2.1478765	-0.48465834	0.51077265	2.4620965
1	3.5315927	1.05095053	-1.14377727	0.5075770
1	2.8735884	1.62357534	0.09842164	3.4708681
1	3.0041415	0.34208320	0.89066474	3.7160819
1	2.7695848	-0.28623747	0.48537107	1.1132312
1	2.4831004	-0.04640451	-0.54516112	2.1082411
1	0.2734891	0.64293045	0.59037663	0.1854209
1	1.2071360	-0.99812340	-0.34836152	0.9970667
1	1.4916754	0.68135585	0.38959437	2.0518593
1	2.3509516	0.25828375	1.11710360	4.7861414

Bivariate p-values adjusted with 'fdr' correction;
3 variable(s) excluded, 1 selected (with q = 0.05)

$exclude

A data.frame: 3 × 5
	bivariate.coeff	AIC	BIC	p.value	p.adjusted
	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
gene4	0.18370191	30.65284	32.64431	0.03582045	0.07164089
gene3	-0.19497309	33.02700	35.01846	0.16833496	0.22444661
gene2	0.06747109	34.64259	36.63406	0.55209172	0.55209172

$select

A data.frame: 1 × 5
	bivariate.coeff	AIC	BIC	p.value	p.adjusted
	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
gene1	0.2209951	21.61472	23.60618	3.353635e-05	0.0001341454

Microarray gene expression analysis#

To analyze gene expression data, we need to install Bioconductor packages. To install Bioconductor packages, type the following in an R command window:

if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install(c("affy","limma","arrays"))

We first import data from the package arrays.

library(affy)   # Affymetrix pre-processing
library(limma)  # two-color pre-processing; differential
                  # expression
                
## import "phenotype" data, describing the experimental design
phenoData <- read.AnnotatedDataFrame(system.file("extdata", "pData.txt", package="arrays"))
celfiles <- system.file("extdata", package="arrays")

Loading required package: BiocGenerics

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min

Loading required package: Biobase

Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.

Warning message:
“package ‘limma’ was built under R version 4.2.2”

Attaching package: ‘limma’

The following object is masked from ‘package:BiocGenerics’:

    plotMA

RMA normalization for microarray expression data

eset <- justRMA(phenoData=phenoData, celfile.path=celfiles)

'getOption("repos")' replaces Bioconductor standard repositories, see
'help("repositories", package = "BiocManager")' for details.
Replacement repositories:
    CRAN: https://cran.r-project.org

'getOption("repos")' replaces Bioconductor standard repositories, see
'help("repositories", package = "BiocManager")' for details.
Replacement repositories:
    CRAN: https://cran.r-project.org

installing the source package ‘hgfocuscdf’

Warning message in install.packages(cdfname, lib = lib, repos = repositories(), :
“installation of package ‘hgfocuscdf’ had non-zero exit status”

Error in getCdfInfo(object): Could not obtain CDF environment, problems encountered:
Specified environment does not contain HG-Focus
Library - package hgfocuscdf not installed
Library - package hgfocuscdf not installed
Traceback:

1. justRMA(phenoData = phenoData, celfile.path = celfiles)
2. just.rma(filenames = l$filenames, phenoData = l$phenoData, description = l$description, 
 .     notes = notes, compress = compress, rm.mask = rm.mask, rm.outliers = rm.outliers, 
 .     rm.extra = rm.extra, verbose = verbose, normalize = normalize, 
 .     background = background, bgversion = bgversion, destructive = destructive, 
 .     cdfname = cdfname)
3. pmindex(tmp)
4. pmindex(tmp)
5. .local(object, ...)
6. indexProbes(object, "pm", genenames = genenames)
7. indexProbes(object, "pm", genenames = genenames)
8. .local(object, which, ...)
9. getCdfInfo(object)
10. stop(paste("Could not obtain CDF environment, problems encountered:", 
  .     paste(unlist(badOut), collapse = "\n"), sep = "\n"))

Identify differentially expressed genes

combn <- factor(paste(pData(phenoData)[,1], pData(phenoData)[,2], sep = "_"))
design <- model.matrix(~combn) # describe model to be fit

fit <- lmFit(eset, design)  # fit each probeset to model
efit <- eBayes(fit)        # empirical Bayes adjustment
topTable(efit, coef=2)      # table of differentially expressed probesets