longteng

随笔记录, 工作日记.
正文

GSVA

(2024-11-22 17:47:45) 下一个

https://bioconductor.org/packages/release/bioc/html/TCGAbiolinks.html

library(TCGAbiolinks)

 

library(maftools)

 

getGDCprojects()

query <- GDCquery(

    project = "TCGA-HNSC",

    data.category = "Transcriptome Profiling", experimental.strategy = 'RNA-Seq',

    data.type = "Gene Expression Quantification",

    workflow.type = "HTSeq - Counts")

GDCdownload(query)

d <- getResults(query)

HNSC.Rnaseq.SE <- GDCprepare(query)

HNSCMatrix <- assay(HNSC.Rnaseq.SE,"unstranded")

HNSC.RNAseq_CorOutliers <- TCGAanalyze_Preprocessing(HNSC.Rnaseq.SE)

library(TCGAbiolinks)

 

# normalization of genes

dataNorm <- TCGAanalyze_Normalization(

    tabDF = HNSC.RNAseq_CorOutliers,

    geneInfo =  geneInfoHT

)

 

# quantile filter of genes

dataFilt <- TCGAanalyze_Filtering(

    tabDF = dataNorm,

    method = "quantile", 

    qnt.cut =  0.25

)

 

# selection of normal samples "NT"

samplesNT <- TCGAquery_SampleTypes(

    barcode = colnames(dataFilt),

    typesample = c("NT")

)

 

# selection of tumor samples "TP"

samplesTP <- TCGAquery_SampleTypes(

    barcode = colnames(dataFilt), 

    typesample = c("TP")

)

 

# Diff.expr.analysis (DEA)

dataDEGs <- TCGAanalyze_DEA(

    mat1 = dataFilt[,samplesNT],

    mat2 = dataFilt[,samplesTP],

    Cond1type = "Normal",

    Cond2type = "Tumor",

    fdr.cut = 0.01 ,

    logFC.cut = 1,

    method = "glmLRT"

)

 

# DEGs table with expression values in normal and tumor samples

dataDEGsFiltLevel <- TCGAanalyze_LevelTab(

    FC_FDR_table_mRNA = dataDEGs,

    typeCond1 = "Tumor",

    typeCond2 = "Normal",

    TableCond1 = dataFilt[,samplesTP],

    TableCond2 = dataFilt[,samplesNT]

)

 

 

https://alexslemonade.github.io/refinebio-examples/03-rnaseq/pathway-analysis_rnaseq_03_gsva.html#24_About_the_dataset_we_are_using_for_this_example

 

 

if (!("DESeq2" %in% installed.packages())) {

  # Install this package if it isn't installed yet

  BiocManager::install("DESeq2", update = FALSE)

}

 

if (!("GSVA" %in% installed.packages())) {

  # Install this package if it isn't installed yet

  BiocManager::install("GSVA", update = FALSE)

}

 

if (!("qusage" %in% installed.packages())) {

  # Install this package if it isn't installed yet

  BiocManager::install("qusage", update = FALSE)

}

 

if (!("org.Hs.eg.db" %in% installed.packages())) {

  # Install this package if it isn't installed yet

  BiocManager::install("org.Hs.eg.db", update = FALSE)

}

 

if (!("pheatmap" %in% installed.packages())) {

  # Install pheatmap

  install.packages("pheatmap", update = FALSE)

}

 

 

library(DESeq2)

 

# Attach the `qusage` library

library(qusage)

 

# Attach the `GSVA` library

library(GSVA)

 

# Human annotation package we'll use for gene identifier conversion

library(org.Hs.eg.db)

 

# We will need this so we can use the pipe: %>%

library(magrittr)

library(msigdbr)

Prepare data for DESeq2

dds <- DESeqDataSetFromMatrix(

  countData = expression_df, # Our prepped data frame with counts

  colData = metadata, # Data frame with annotation for our samples

  design = ~1 # Here we are not specifying a model

)

dds_norm <- vst(dds)

vst_df <- assay(dds_norm) %>%

  as.data.frame() %>% # Make into a data frame

  tibble::rownames_to_column("gene")

names(vst_df)[1] <- "SYMBOL"

hallmark_gene_sets <- msigdbr::msigdbr(

  species = "Homo sapiens", # Can change this to what species you need

  category = "H" # Only hallmark gene sets

)

head(hallmark_gene_sets)

gs_cat

<chr>

gs_subcat

<chr>

gs_name

<chr>

gene_symbol

<chr>

entrez_gene

<int>

ensembl_gene

<chr>

 

H

 

HALLMARK_ADIPOGENESIS

ABCA1

19

ENSG00000165029

 

H

 

HALLMARK_ADIPOGENESIS

ABCB8

11194

ENSG00000197150


 

 

hallmarks_list <- split(

  hallmark_gene_sets$entrez_gene, # The genes we want split into pathways

  hallmark_gene_sets$gs_name # The pathways made as the higher levels of the list

)

## $HALLMARK_ADIPOGENESIS

##   [1]     19  11194  10449     33     34     35     47     50     51

##  [10]    112 149685   9370  79602  79602  56894   9131    204    217

 

keytypes(org.Hs.eg.db)

##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT" 

##  [5] "ENSEMBLTRANS" "ENTREZID"     "ENZYME"       "EVIDENCE"    

##  [9] "EVIDENCEALL"  "GENENAME"     "GO"           "GOALL"       

## [13] "IPI"          "MAP"          "OMIM"         "ONTOLOGY"    

## [17] "ONTOLOGYALL"  "PATH"         "PFAM"         "PMID"        

## [21] "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"      

## [25] "UNIGENE"      "UNIPROT"

msigdbr_collections()

# A tibble: 23 x 3

   gs_cat gs_subcat         num_genesets

   <chr>  <chr>                    <int>

 1 C1     ""                         299

 2 C2     "CGP"                     3384

 3 C2     "CP"                        29

 4 C2     "CP:BIOCARTA"              292

 5 C2     "CP:REACTOME"                  186

 6 C2     "CP:PID"                   196

 7 C2     "CP:REACTOME"             1615

 8 C2     "CP:WIKIPATHWAYS"          664

 9 C3     "MIR:MIRDB"               2377

10 C3     "MIR:MIR_Legacy"           221

cgp_gene_sets = msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CGP")

cgp_list <- split(

    cgp_gene_sets$entrez_gene, # The genes we want split into pathways

    cgp_gene_sets$gs_name # The pathways made as the higher levels of the list

)

REACTOME_gene_sets = msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CP:REACTOME")

REACTOME_list <- split(

    REACTOME_gene_sets$entrez_gene, # The genes we want split into pathways

    REACTOME_gene_sets$gs_name # The pathways made as the higher levels of the list

)

BIOCARTA_gene_sets = msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CP:BIOCARTA")

BIOCARTA_list <- split(

BIOCARTA_gene_sets$entrez_gene, # The genes we want split into pathways

BIOCARTA_gene_sets$gs_name # The pathways made as the higher levels of the list

)

mapped_df <- data.frame(

    "entrez_id" = mapIds(

        # Replace with annotation package for the organism relevant to your data

        org.Hs.eg.db,

        keys = vst_df$SYMBOL,

        # Replace with the type of gene identifiers in your data

        keytype = "SYMBOL",

        # Replace with the type of gene identifiers you would like to map to

        column = "ENTREZID",

        # This will keep only the first mapped value for each Ensembl ID

        multiVals = "first"

    )

) %>%

    # If an Ensembl gene identifier doesn't map to a Entrez gene identifier,

    # drop that from the data frame

    dplyr::filter(!is.na(entrez_id)) %>%

    # Make an `Ensembl` column to store the row names

    tibble::rownames_to_column("SYMBOL") %>%

    # Now let's join the rest of the expression data

    dplyr::inner_join(vst_df, by = c("SYMBOL" = "SYMBOL"))

mapped_df[1:3, 1:5]

entrez_id[1945] <- NULL #remove list

entrez_id1 <- as_tibble(entrez_id)

which(is.na(filtered_mapped_matrix$entrez_id)) # check missing value

  SYMBOL entrez_id TCGA.4P.AA8J.01A.11R.A39I.07 TCGA.BA.4074.01A.01R.1436.07 TCGA.BA.4075.01A.01R.1436.07

1   A1BG         1                     7.761813                     7.828986                     7.020303

2   A1CF     29974                     5.174723                     5.178919                     5.186378

3  A2ML1    144568                    11.034088                     8.121422                     6.562782

gene_means <- rowMeans(mapped_df %>% dplyr::select(-SYMBOL, -entrez_id))

mapped_df <- mapped_df %>%

    # Add gene_means as a column called gene_means

    dplyr::mutate(gene_means) %>%

    # Reorder the columns so `gene_means` column is upfront

    dplyr::select(SYMBOL, entrez_id, gene_means, dplyr::everything())

filtered_mapped_df <- mapped_df %>%

  # Sort so that the highest mean expression values are at the top

  dplyr::arrange(dplyr::desc(gene_means)) %>%

  # Filter out the duplicated rows using `dplyr::distinct()`

  dplyr::distinct(entrez_id, .keep_all = TRUE)

sum(duplicated(filtered_mapped_df$SYMBOL))

 

filtered_mapped_matrix <- filtered_mapped_df %>%

    # GSVA can't the Ensembl IDs so we should drop this column as well as the means

    dplyr::select(-SYMBOL, -gene_means) %>%

    # We need to store our gene identifiers as row names

    tibble::column_to_rownames("entrez_id") %>%

    # Now we can convert our object into a matrix

    as.matrix()

gsva_results <- gsva(

    filtered_mapped_matrix,

    hallmarks_list,

    method = "gsva",

    # Appropriate for our vst transformed data

    kcdf = "Gaussian",

    # Minimum gene set size

    min.sz = 15,

    # Maximum gene set size

    max.sz = 500,

    # Compute Gaussian-distributed scores

    mx.diff = TRUE,

    # Don't print out the progress bar

    verbose = FALSE

)

tgsva_REACTOME_results <- as.data.frame(t(gsva_REACTOME_results))

tgsva_REACTOME_results$ID <- rownames(tgsva_REACTOME_results)

gsva_REACTOME_results1 <- merge(metadata, tgsva_REACTOME_results, by = "ID")

ddd1 <- gsva_REACTOME_results1[, 2:52] %>%

    gather(key = variable, value = value, - group) %>% 

    group_by(group, variable) %>% 

    summarise(value = list(value)) %>% 

    spread(group, value) %>% 

    group_by(variable) %>% 

    mutate(p_value = t.test(unlist(TT), unlist(NT))$p.value,

           t_value = t.test(unlist(TT), unlist(NT))$statistic)

ggplot(ddd1, aes(reorder(variable, t_value), t_value)) +

    geom_col(aes(fill=p_value<0.05)) +

    coord_flip() +

    labs(x="Pathway", y="t_value of GSVA score ",

         title="GSVA_REACTOME in HNSC_TT_vs_NT ") + 

    theme_minimal()

 

library(annotables)

grch38 %>% filter(ensgene %in%  )

[ 打印 ]
阅读 ()评论 (0)
评论
目前还没有任何评论
登录后才可评论.