https://bioconductor.org/packages/release/bioc/html/TCGAbiolinks.html
library(TCGAbiolinks)
library(maftools)
getGDCprojects()
query <- GDCquery(
project = "TCGA-HNSC",
data.category = "Transcriptome Profiling", experimental.strategy = 'RNA-Seq',
data.type = "Gene Expression Quantification",
workflow.type = "HTSeq - Counts")
GDCdownload(query)
d <- getResults(query)
HNSC.Rnaseq.SE <- GDCprepare(query)
HNSCMatrix <- assay(HNSC.Rnaseq.SE,"unstranded")
HNSC.RNAseq_CorOutliers <- TCGAanalyze_Preprocessing(HNSC.Rnaseq.SE)
library(TCGAbiolinks)
# normalization of genes
dataNorm <- TCGAanalyze_Normalization(
tabDF = HNSC.RNAseq_CorOutliers,
geneInfo = geneInfoHT
)
# quantile filter of genes
dataFilt <- TCGAanalyze_Filtering(
tabDF = dataNorm,
method = "quantile",
qnt.cut = 0.25
)
# selection of normal samples "NT"
samplesNT <- TCGAquery_SampleTypes(
barcode = colnames(dataFilt),
typesample = c("NT")
)
# selection of tumor samples "TP"
samplesTP <- TCGAquery_SampleTypes(
barcode = colnames(dataFilt),
typesample = c("TP")
)
# Diff.expr.analysis (DEA)
dataDEGs <- TCGAanalyze_DEA(
mat1 = dataFilt[,samplesNT],
mat2 = dataFilt[,samplesTP],
Cond1type = "Normal",
Cond2type = "Tumor",
fdr.cut = 0.01 ,
logFC.cut = 1,
method = "glmLRT"
)
# DEGs table with expression values in normal and tumor samples
dataDEGsFiltLevel <- TCGAanalyze_LevelTab(
FC_FDR_table_mRNA = dataDEGs,
typeCond1 = "Tumor",
typeCond2 = "Normal",
TableCond1 = dataFilt[,samplesTP],
TableCond2 = dataFilt[,samplesNT]
)
if (!("DESeq2" %in% installed.packages())) {
# Install this package if it isn't installed yet
BiocManager::install("DESeq2", update = FALSE)
}
if (!("GSVA" %in% installed.packages())) {
# Install this package if it isn't installed yet
BiocManager::install("GSVA", update = FALSE)
}
if (!("qusage" %in% installed.packages())) {
# Install this package if it isn't installed yet
BiocManager::install("qusage", update = FALSE)
}
if (!("org.Hs.eg.db" %in% installed.packages())) {
# Install this package if it isn't installed yet
BiocManager::install("org.Hs.eg.db", update = FALSE)
}
if (!("pheatmap" %in% installed.packages())) {
# Install pheatmap
install.packages("pheatmap", update = FALSE)
}
library(DESeq2)
# Attach the `qusage` library
library(qusage)
# Attach the `GSVA` library
library(GSVA)
# Human annotation package we'll use for gene identifier conversion
library(org.Hs.eg.db)
# We will need this so we can use the pipe: %>%
library(magrittr)
library(msigdbr)
Prepare data for DESeq2
dds <- DESeqDataSetFromMatrix(
countData = expression_df, # Our prepped data frame with counts
colData = metadata, # Data frame with annotation for our samples
design = ~1 # Here we are not specifying a model
)
dds_norm <- vst(dds)
vst_df <- assay(dds_norm) %>%
as.data.frame() %>% # Make into a data frame
tibble::rownames_to_column("gene")
names(vst_df)[1] <- "SYMBOL"
hallmark_gene_sets <- msigdbr::msigdbr(
species = "Homo sapiens", # Can change this to what species you need
category = "H" # Only hallmark gene sets
)
head(hallmark_gene_sets)
gs_cat <chr> | gs_subcat <chr> | gs_name <chr> | gene_symbol <chr> | entrez_gene <int> | ensembl_gene <chr> | |
---|---|---|---|---|---|---|
H | HALLMARK_ADIPOGENESIS | ABCA1 | 19 | ENSG00000165029 | ||
H | HALLMARK_ADIPOGENESIS | ABCB8 | 11194 | ENSG00000197150 |
|
hallmarks_list <- split(
hallmark_gene_sets$entrez_gene, # The genes we want split into pathways
hallmark_gene_sets$gs_name # The pathways made as the higher levels of the list
)
## $HALLMARK_ADIPOGENESIS
## [1] 19 11194 10449 33 34 35 47 50 51
## [10] 112 149685 9370 79602 79602 56894 9131 204 217
keytypes(org.Hs.eg.db)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT"
## [5] "ENSEMBLTRANS" "ENTREZID" "ENZYME" "EVIDENCE"
## [9] "EVIDENCEALL" "GENENAME" "GO" "GOALL"
## [13] "IPI" "MAP" "OMIM" "ONTOLOGY"
## [17] "ONTOLOGYALL" "PATH" "PFAM" "PMID"
## [21] "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"
## [25] "UNIGENE" "UNIPROT"
msigdbr_collections()
# A tibble: 23 x 3
gs_cat gs_subcat num_genesets
<chr> <chr> <int>
1 C1 "" 299
2 C2 "CGP" 3384
3 C2 "CP" 29
4 C2 "CP:BIOCARTA" 292
5 C2 "CP:REACTOME" 186
6 C2 "CP:PID" 196
7 C2 "CP:REACTOME" 1615
8 C2 "CP:WIKIPATHWAYS" 664
9 C3 "MIR:MIRDB" 2377
10 C3 "MIR:MIR_Legacy" 221
cgp_gene_sets = msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CGP")
cgp_list <- split(
cgp_gene_sets$entrez_gene, # The genes we want split into pathways
cgp_gene_sets$gs_name # The pathways made as the higher levels of the list
)
REACTOME_gene_sets = msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CP:REACTOME")
REACTOME_list <- split(
REACTOME_gene_sets$entrez_gene, # The genes we want split into pathways
REACTOME_gene_sets$gs_name # The pathways made as the higher levels of the list
)
BIOCARTA_gene_sets = msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CP:BIOCARTA")
BIOCARTA_list <- split(
BIOCARTA_gene_sets$entrez_gene, # The genes we want split into pathways
BIOCARTA_gene_sets$gs_name # The pathways made as the higher levels of the list
)
mapped_df <- data.frame(
"entrez_id" = mapIds(
# Replace with annotation package for the organism relevant to your data
org.Hs.eg.db,
keys = vst_df$SYMBOL,
# Replace with the type of gene identifiers in your data
keytype = "SYMBOL",
# Replace with the type of gene identifiers you would like to map to
column = "ENTREZID",
# This will keep only the first mapped value for each Ensembl ID
multiVals = "first"
)
) %>%
# If an Ensembl gene identifier doesn't map to a Entrez gene identifier,
# drop that from the data frame
dplyr::filter(!is.na(entrez_id)) %>%
# Make an `Ensembl` column to store the row names
tibble::rownames_to_column("SYMBOL") %>%
# Now let's join the rest of the expression data
dplyr::inner_join(vst_df, by = c("SYMBOL" = "SYMBOL"))
mapped_df[1:3, 1:5]
entrez_id[1945] <- NULL #remove list
entrez_id1 <- as_tibble(entrez_id)
which(is.na(filtered_mapped_matrix$entrez_id)) # check missing value
SYMBOL entrez_id TCGA.4P.AA8J.01A.11R.A39I.07 TCGA.BA.4074.01A.01R.1436.07 TCGA.BA.4075.01A.01R.1436.07
1 A1BG 1 7.761813 7.828986 7.020303
2 A1CF 29974 5.174723 5.178919 5.186378
3 A2ML1 144568 11.034088 8.121422 6.562782
gene_means <- rowMeans(mapped_df %>% dplyr::select(-SYMBOL, -entrez_id))
mapped_df <- mapped_df %>%
# Add gene_means as a column called gene_means
dplyr::mutate(gene_means) %>%
# Reorder the columns so `gene_means` column is upfront
dplyr::select(SYMBOL, entrez_id, gene_means, dplyr::everything())
filtered_mapped_df <- mapped_df %>%
# Sort so that the highest mean expression values are at the top
dplyr::arrange(dplyr::desc(gene_means)) %>%
# Filter out the duplicated rows using `dplyr::distinct()`
dplyr::distinct(entrez_id, .keep_all = TRUE)
sum(duplicated(filtered_mapped_df$SYMBOL))
filtered_mapped_matrix <- filtered_mapped_df %>%
# GSVA can't the Ensembl IDs so we should drop this column as well as the means
dplyr::select(-SYMBOL, -gene_means) %>%
# We need to store our gene identifiers as row names
tibble::column_to_rownames("entrez_id") %>%
# Now we can convert our object into a matrix
as.matrix()
gsva_results <- gsva(
filtered_mapped_matrix,
hallmarks_list,
method = "gsva",
# Appropriate for our vst transformed data
kcdf = "Gaussian",
# Minimum gene set size
min.sz = 15,
# Maximum gene set size
max.sz = 500,
# Compute Gaussian-distributed scores
mx.diff = TRUE,
# Don't print out the progress bar
verbose = FALSE
)
tgsva_REACTOME_results <- as.data.frame(t(gsva_REACTOME_results))
tgsva_REACTOME_results$ID <- rownames(tgsva_REACTOME_results)
gsva_REACTOME_results1 <- merge(metadata, tgsva_REACTOME_results, by = "ID")
ddd1 <- gsva_REACTOME_results1[, 2:52] %>%
gather(key = variable, value = value, - group) %>%
group_by(group, variable) %>%
summarise(value = list(value)) %>%
spread(group, value) %>%
group_by(variable) %>%
mutate(p_value = t.test(unlist(TT), unlist(NT))$p.value,
t_value = t.test(unlist(TT), unlist(NT))$statistic)
ggplot(ddd1, aes(reorder(variable, t_value), t_value)) +
geom_col(aes(fill=p_value<0.05)) +
coord_flip() +
labs(x="Pathway", y="t_value of GSVA score ",
title="GSVA_REACTOME in HNSC_TT_vs_NT ") +
theme_minimal()
library(annotables)
grch38 %>% filter(ensgene %in% )