cat IN1_S1_L001_R1_001.fastq IN1_S1_L002_R1_001.fastq > IN1_S1_concatenated_R1_001.fastq
cat IN1_S1_L001_R2_001.fastq IN1_S1_L002_R2_001.fastq > IN1_S1_concatenated_R2_001.fastq
https://bioinformatics-core-shared-training.github.io/RNAseq-R/align-and-count.nb.html
library(readr)
library(dplyr)
library(ggplot2)
library(tidyverse)
library(DESeq2)
mycounts <- read.csv("C:/Users/?/Desktop/mycounts.csv")
metadata <- read.csv("C:/Users/?/Desktop/metadata.csv")
#countData <- mycounts[!(duplicated(mycounts[[1]]) | duplicated(mycounts[[1]], fromLast=TRUE)), ]
table(duplicated(mycounts$symbol))
mycounts<- mycounts[duplicated(mycounts$symbol) == FALSE,]
#rownames(mycounts) = make.names(mycounts$gene, unique=TRUE)
mycounts <- as.data.frame(mycounts)
metadata <- as.data.frame(metadata)
head(mycounts)
head(metadata)
class(mycounts)
class(metadata)
dds <- DESeqDataSetFromMatrix(countData=mycounts,
colData=metadata,
design=~Group,
tidy=TRUE)
dds
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
#smallestGroupSize <- 3
#keep <- rowSums(counts(dds) >= 10) >= smallestGroupSize
#dds <- dds[keep,]
#keep <- rowSums(counts(dds) >= 15) >= 34 #remove all genes with counts<15 in more than 75% samples(45*0.75=33.75)
#dds <- dds[keep,]
sizeFactors(dds)
dispersions(dds)
dds <- DESeq(dds)
res <- results(dds, tidy=TRUE)
res = results(dds, contrast=c("Group", "Treat", "CNL"), tidy=TRUE)
res <- as_tibble(res)
res
res %>%
filter(padj<0.05) %>%
write_csv("sigresults.csv")
vsdata <- vst(dds, blind=FALSE)
plotPCA(vsdata, intgroup="Group") + geom_label(aes(label = colnames(vsdata))) + labs(title=" PCA")
df <- as.data.frame(colData(vsdata)[,c("Treat","Group","D","R"," ")])
H <- c("P53","MYC")
mat <- assay(vsdata)[ H, ]
mat <- as.data.frame(mat)
mat1 <- mat %>% arrange(desc(T20))
pheatmap(mat1, annotation_col=df, cluster_rows=F, cluster_cols=F,main = "heatmap")
plotCounts(dds, gene="EN", intgroup="Group", returnData=TRUE) %>%
ggplot(aes(Group, count)) + geom_boxplot(aes(fill=Group)) + scale_y_log10() + ggtitle("")
vsdata <- varianceStabilizingTransformation(dds) (less row)
#Volcano
p <- ggplot(data=res, aes(x=log2FoldChange, y=-log10(pvalue))) + geom_point()
# Add more simple "theme"
p <- ggplot(data=res, aes(x=log2FoldChange, y=-log10(pvalue))) + geom_point() + theme_minimal()
# Add vertical lines for log2FoldChange thresholds, and one horizontal line for the p-value threshold
p2 <- p + geom_vline(xintercept=c(-0.6, 0.6), col="red") +
geom_hline(yintercept=-log10(0.05), col="red")
# The significantly differentially expressed genes are the ones found in the upper-left and upper-right corners.
# Add a column to the data frame to specify if they are UP- or DOWN- regulated (log2FoldChange respectively positive or negative)
# add a column of NAs
res$diffexpressed <- "NO"
# if log2Foldchange > 0.6 and pvalue < 0.05, set as "UP"
res$diffexpressed[res$log2FoldChange > 0.6 & res$pvalue < 0.05] <- "UP"
# if log2Foldchange < -0.6 and pvalue < 0.05, set as "DOWN"
res$diffexpressed[res$log2FoldChange < -0.6 & res$pvalue < 0.05] <- "DOWN"
# Re-plot but this time color the points with "diffexpressed"
p <- ggplot(data=res, aes(x=log2FoldChange, y=-log10(pvalue), col=diffexpressed)) + geom_point() + theme_minimal()
# Add lines as before...
p2 <- p + geom_vline(xintercept=c(-0.6, 0.6), col="red") +
geom_hline(yintercept=-log10(0.05), col="red")
## Change point color
# 1. by resfault, it is assigned to the categories in an alphabetical orresr):
p3 <- p2 + scale_color_manual(values=c("blue", "black", "red"))
# 2. to automate a bit: ceate a named vector: the values are the colors to be used, the names are the categories they will be assigned to:
mycolors <- c("blue", "red", "black")
names(mycolors) <- c("DOWN", "UP", "NO")
p3 <- p2 + scale_colour_manual(values = mycolors)
# Now write down the name of genes besires the points...
# Create a new column "reslabel" to res, that will contain the name of genes differentially expressed (NA in case they are not)
res$delabel <- NA
res$delabel[res$diffexpressed != "NO"] <- res$row[res$diffexpressed != "NO"]
ggplot(data=res, aes(x=log2FoldChange, y=-log10(pvalue), col=diffexpressed, label=delabel)) +
geom_point() +
theme_minimal() +
geom_text()
library(ggrepel)
# plot adding up all layers we have seen so far
ggplot(data=res, aes(x=log2FoldChange, y=-log10(pvalue), col=diffexpressed, label=delabel)) +
geom_point() +
theme_minimal() +
geom_text_repel() +
scale_color_manual(values=c("blue", "black", "red")) +
geom_vline(xintercept=c(-0.6, 0.6), col="red") +
geom_hline(yintercept=-log10(0.05), col="red")
plotCounts(dds, gene="", intgroup="Group", returnData=TRUE) %>% ggplot(aes(Group, count)) + geom_boxplot(aes(fill=roup)) + scale_y_log10() + ggtitle("") + scale_x_discrete(name ="Group", limits=c("CNL","miR"))
plotCounts(dds, gene="BTF3", intgroup="Morphologic.Classification", returnData=TRUE) %>% ggplot(aes(Morphologic.Classification, count)) + geom_boxplot(aes(fill=Morphologic.Classification)) + scale_y_log10() + ggtitle(" ") + theme(plot.title = element_text(size = 30)) + scale_x_discrete(name ="Morphologic.Classification", limits=c("m0 ","m1","m2","m3","m4","m5"))
https://www.data-to-viz.com/caveat/boxplot.html
ggplot(data = dat, aes(x=Treatment, y=cell., fill=Treatment)) + scale_x_discrete(name ="Treatment", limits=c("UT","B","A","B+A")) + geom_violin(width=0.8) + geom_boxplot(width=0.8, color="black", alpha=0.8) + theme( legend.position="none", plot.title = element_text(size=15)) + ggtitle("Treatment") + theme(axis.text = element_text(size=12)) + theme(panel.background = element_rect(fill = "white", colour = "grey50"))
library("genefilter")
vsdata <- vst(dds, blind=FALSE)
sum( rowMeans( counts(dds, normalized=TRUE)) > 5 )
plotPCA(vsdata, intgroup="Group")
plotPCA(vsdata, intgroup="Group") + geom_label(aes(label = colnames(vsdata)))
topVarGenes <- head(order(-rowVars(assay(vsdata))),50)
mat <- assay(vsdata)[ topVarGenes, ]
mat <- mat - rowMeans(mat)
df <- as.data.frame(colData(vsdata)[,c("ID","C","B")])
pheatmap(mat, annotation_col=df, cluster_rows=FALSE, cluster_cols=FALSE)
res1 <- res %>% filter(padj<0.05) %>% arrange(padj)
res2 <- res1[1:100,] %>% arrange(desc(log2FoldChange))
Sig <- res2$row
mat <- assay(vsdata)[ Sig, ]
mat <- mat - rowMeans(mat)
df <- as.data.frame(colData(vsdata)[,c("ID","Group")])
pheatmap(mat, annotation_col=df, cluster_rows=F, cluster_cols=F)
zmat <- scale(tmat1, center = TRUE, scale = TRUE) # upon column
sacPlsda <- opls(vsdata, "Treatment", orthoI = 1)
PLS-DA
library(org.Hs.eg.db)
ens2symbol <- AnnotationDbi::select(org.Hs.eg.db,
key=res$row,
columns="SYMBOL",
keytype="SYMBOL")
ens2symbol <- as_tibble(ens2symbol)
ens2symbol
res2 <- res %>%
dplyr::select(SYMBOL, stat) %>%
na.omit() %>%
distinct() %>%
group_by(SYMBOL) %>%
summarize(stat=mean(stat))
res2
library(fgsea)
ranks <- deframe(res2)
head(ranks, 20)
pathways.hallmark <- gmtPathways("C:/Users/?/Desktop/GSEA dataset/h.all.v2023.1.Hs.symbols.gmt")
fgseaRes <- fgsea(pathways=pathways.hallmark, stats=ranks, nperm=1000)
fgseaResTidy <- fgseaRes %>%
as_tibble() %>%
arrange(desc(NES))
# Show in a nice table:
hallmark <- fgseaResTidy %>%
dplyr::select(-leadingEdge, -ES, -nMoreExtreme) %>%
arrange(padj)
hallmark <- as.matrix(hallmark)
write.csv(hallmark, " -NRhallmark.csv")
pathways.hallmark <- gmtPathways("C:/Users/?/Desktop/GSEA dataset/ h.all.v2023.1.Hs.symbols.gmt")
fgseaRes <- fgsea(pathways=pathways.hallmark, stats=ranks, nperm=1000)
fgseaRes <- fgsea(pathways=pathways.hallmark, stats=ranks, eps = 0.0,
minSize = 15,
maxSize = 500)
fgseaResTidy <- fgseaRes %>%
as_tibble() %>%
arrange(desc(NES))
# Show in a nice table:
fgseaResTidy %>%
dplyr::select(-leadingEdge, -ES, -nMoreExtreme) %>%
arrange(padj) %>%
DT::datatable()
ggplot(fgseaResTidy, aes(reorder(pathway, NES), NES)) +
geom_col(aes(fill=padj<0.05)) +
coord_flip() +
labs(x="Pathway", y="Normalized Enrichment Score",
title="Hallmark pathways NES from GSEA") +
theme_minimal()
fgseaRes <- fgsea(pathways=pathways.hallmark, stats=ranks, nperm=1000)
go <- fgsea(pathways=gmtPathways("C:/Users/?/Desktop/GSEA dataset/c5.all.v7.2.symbols.gmt"), ranks, nperm=1000) %>% as_tibble()
Re <- fgsea(pathways=gmtPathways("C:/Users/?/Downloads/c2.cp.reactome.v2023.1.Hs.symbols.gmt"), ranks, nperm=1000) %>%
as_tibble()
go <- as.matrix(go)
write.csv(go, "go.csv")
Re <- as.matrix(Re)
write.csv(Re, "Re.csv")
hallmark <- fgsea(pathways=gmtPathways("C:/Users/?/Desktop/GSEA dataset/h.all.v7.2.symbols.gmt "), ranks, nperm=1000) %>% as_tibble()
hallmark <- as.matrix(hallmark)
write.csv(hallmark, "hallmark B PRE + R vs NR.csv")
plotEnrichment(pathways.hallmark[["HALLMARK_INTERFERON_GAMMA_RESPONSE"]], ranks) + labs(title="HALLMARK_INTERFERON_GAMMA_RESPONSE")
plotEnrichment(pathways.hallmark[["HALLMARK_APOPTOSIS"]], ranks) + labs(title=" HALLMARK_APOPTOSIS")
pathways.immunmark <-gmtPathways("C:/Users/?/Dowloads/c7.all.v2022.1.Hs.symbols.gmt")
cell type signature gene sets
pathways.cellmark <- gmtPathways("C:/Users/?/Downloads/c8.all.v2022.1.Hs.symbols.gmt")
ggplot(Hal, aes(NES, NAME))+ geom_col(aes(fill=NES<0)) + scale_y_discrete(name ="NAME", limits=c("MITOTIC_SPINDLE", "IL6_JAK_STAT3_SIGNALING", "EPITHELIAL_MESENCHYMAL_TRANSITION")) + labs(x="Pathway", y="Normalized Enrichment Score", title="Hallmark pathways NES from GSEA") + theme_minimal()
pathway | pval | padj | NES | size | |
---|---|---|---|---|---|
1 | HALLMARK_E2F_TARGETS | 0.003125 | 0.016 | 1.50255883270908 | 196 |
2 | HALLMARK_P53_PATHWAY | 0.00307692307692308 | 0.016 | 1.46976609146818 | 193 |
ggplot(fgseaResTidy, aes(reorder(pathway, NES), NES)) +
geom_col(aes(fill=padj<0.05)) +
coord_flip() +
labs(x="Pathway", y="Normalized Enrichment Score",
title="Hallmark pathways NES from GSEA") +
theme_minimal()
fgseaRes <- fgsea(pathways=pathways.hallmark, stats=ranks, nperm=1000)
go <- fgsea(pathways=gmtPathways("C:/Users/?/Desktop/GSEA dataset/c5.all.v7.2.symbols.gmt"), ranks, nperm=1000) %>% as_tibble()
Re <- fgsea(pathways=gmtPathways("C:/Users/?/Downloads/c2.cp.reactome.v2023.1.Hs.symbols.gmt"), ranks, nperm=1000) %>%
as_tibble()
go <- as.matrix(go)
write.csv(go, "go.csv")
Re <- as.matrix(Re)
write.csv(Re, "Re.csv")
hallmark <- fgsea(pathways=gmtPathways("C:/Users/?/Desktop/GSEA dataset/h.all.v7.2.symbols.gmt "), ranks, nperm=1000) %>% as_tibble()
hallmark <- as.matrix(hallmark)
write.csv(hallmark, "hallmark B PRE + R vs NR.csv")
plotEnrichment(pathways.hallmark[["HALLMARK_INTERFERON_GAMMA_RESPONSE"]],
ranks) + labs(title="HALLMARK_INTERFERON_GAMMA_RESPONSE")
plotEnrichment(pathways.hallmark[["HALLMARK_APOPTOSIS"]],
ranks) + labs(title=" HALLMARK_APOPTOSIS")
pathways.immunmark <- gmtPathways("C:/Users/?/Dowloads/c7.all.v2022.1.Hs.symbols.gmt")
cell type signature gene sets
pathways.cellmark <- gmtPathways("C:/Users/?/Downloads/c8.all.v2022.1.Hs.symbols.gmt")
ggplot(Hal, aes(NES, NAME))+ geom_col(aes(fill=NES<0)) + scale_y_discrete(name ="NAME", limits=c("MITOTIC_SPINDLE", "IL6_JAK_STAT3_SIGNALING", "EPITHELIAL_MESENCHYMAL_TRANSITION", "APICAL_SURFACE", "COAGULATION", "ANDROGEN_RESPONSE", "UV_RESPONSE_UP", "HEDGEHOG_SIGNALING", "ADIPOGENESIS", "APOPTOSIS", "APICAL_JUNCTION", "INTERFERON_ALPHA_RESPONSE", "XENOBIOTIC_METABOLISM", "KRAS_SIGNALING_DN", "IL2_STAT5_SIGNALING", "HEME_METABOLISM", "UV_RESPONSE_DN", "INFLAMMATORY_RESPONSE", "MYOGENESIS", "ESTROGEN_RESPONSE_LATE", "KRAS_SIGNALING_UP", "PEROXISOME", "ESTROGEN_RESPONSE_EARLY", "INTERFERON_GAMMA_RESPONSE", "ALLOGRAFT_REJECTION", "GLYCOLYSIS", "HYPOXIA")) +
labs(x="Pathway", y="Normalized Enrichment Score",
title="Hallmark pathways NES from GSEA") +
theme_minimal()
212 HAY_BONE_MARROW_CD8_T_CELL 0.001317523 0.006216336 -0.548956711 -1.973916357 0 71 c("PARP8", "PATL2", "TTC16", "PTPRC", "MIAT", "KLRG1", "KIF19", "GBP5", "FYN", "LINC00987", "TRGC2", "NR4A2", "ATP8A1", "TRG-AS1", "RASAL3", "PPP2R5C", "PTPN22", "DUSP2", "CD8B", "HERPUD2", "PTPN7", "SLAMF6", "F2R", "SLA", "SH2D1A", "GZMK", "CD8A", "SYNE2", "ATG2A", "GZMH", "A2M-AS1", "SON", "CXCR4", "ZNF683", "LINC01871", "HLA-B", "JMJD6")
=CHAR(34)&A1&CHAR(34)
mat <- assay(vsdata)[ T, ]
mat <- mat - rowMeans(mat)
df <- as.data.frame(colData(vsdata)[,c("ID","Response.Primary","LN.Responder","Responder"," ","Group","Treatment","T.Stage", "N.Stage")])
pheatmap(mat, annotation_col=df, cluster_rows=F, cluster_cols=F)
Removing NA or NaN values
There are two ways to remove missing values:
Extracting values except for NA or NaN values:
Example 1:
· R
x <- c(1, 2, NA, 3, NA, 4) d <- is.na(x) x[! d] |
Output:
[1] 1 2 3 4
Example 2:
· R
x <- c(1, 2, 0 / 0, 3, NA, 4, 0 / 0) x x[! is.na(x)] |
Output:
[1] 1 2 NaN 3 NA 4 NaN
[1] 1 2 3 4
d <- filter(mycounts, X %in% c("ABCB6", "ADORA2B", "AGL"))
xcell
Install
devtools::install_github('dviraran/xCell')
Note: if the installation fails, try setting options(unzip = "internal") before calling the install function (thanks @hermidalc).
Usage
library(xCell)
exprMatrix = read.table(expr_file,header=TRUE,row.names=1, as.is=TRUE)
xCellAnalysis(exprMatrix)
This function performs all three steps in xCell, which can be performed seperately as well:
exprMatrix = read.table(expr_file,header=TRUE,row.names=1, as.is=TRUE)
exprMatrix = read.table(file='C:\Users\?\Desktop\10-18-22HNSC \Xcell.txt', header=TRUE, row.names=1, as.is=TRUE)
https://github.com/dviraran/xCell#install
The best thing to do might actually be to perform GSEA both ways (log2FC and signed pvalue), then load both sets of results into EnrichmentMap (https://apps.cytoscape.org/apps/enrichmentmap), and identify sets selected as significant by both metrics. EnrichmentMap supports a mode where two datasets from GSEA can be loaded in simultaneously and co-displayed. That would probably be the most rigorous way to do it.
34 HALLMARK_OXIDATIVE_PHOSPHORYLATION 0.006535948 0.016263336 -0.494556993 -2.378736178 0 200 c("COX17", "ATP5PO", "MRPL35", "COX6C", "VDAC3", "NDUFA4", "TIMM8B", "MDH1", "ATP5MC1", "COX7A2", "NDUFA2", "SUCLA2", "ATP5PF", "NDUFS4", "PRDX3", "UQCR11", "MRPL15", "ECHS1", "NDUFA8", "MRPS11", "DLD", "NDUFB8", "SLC25A4", "NDUFB6", "NDUFA5", "NDUFA6", "NDUFC2", "ATP5F1C", "COX7B", "CYB5A", "POLR2F", "NDUFB7", "MRPL11", "NDUFB1", "ATP5ME", "NDUFAB1", "NDUFC1", "ATP5PD", "NDUFB3", "BDH2", "NDUFB4", "ACADM", "TIMM10", "UQCRB", "NDUFS3", "SDHD", "SURF1", "TOMM70", "ATP6V0B", "COX15", "MDH2", "CYCS",
9 HALLMARK_G2M_CHECKPOINT 0.006451613 0.016263336 -0.320157329 -1.535067914 0 196 c("MEIS2", "MEIS1", "MAD2L1", "CKS2", "CENPA", "RAD21", "CCNA2", "HMMR", "FBXO5", "BRCA2", "SRSF1", "EGF", "KPNA2", "AURKA", "PURA", "CDK1", "XPO1", "EZH2", "MARCKS", "NDC80", "TFDP1", "ESPL1", "ORC5", "CCNB2", "KIF23", "SNRPD1", "ORC6", "BIRC5", "CDC6", "PBK", "TOP2A", "SQLE", "MKI67", "DBF4", "TTK", "CUL4A", "CKS1B", "DTYMK", "CDC27", "CENPE", "HSPA8", "CDC45", "PTTG1", "CDC20", "KIF2C", "CBX1", "HOXC10", "BUB3", "RAD54L", "NEK2", "KNL1", "SRSF10", "BUB1", "SMC2", "CDKN3", "UBE2C", "PLK1", "RASAL2", GINS2, "AURKB", "SLC12A2", "HIRA", "INCENP", "CDC7")
An example of a rank metric is the product of the sign of the ‘direction’ in the expression change (i.e. ‘up-regulation’ is positive and ‘down-regulation’ is negative) and the p-value (P).
si=s(Pi)=sign(fold change gene i)⋅−log10(Pi)
Under this rank metric, up-regulated genes with relatively small p-values appear at the top of the list and down-regulated genes with small p-values at the bottom.
dds2 <- DESeqDataSetFromMatrix(countData=mycounts,
colData=metadata,
design=~Group,
tidy=TRUE)
fpm2 <- fpm(dds2)
pheatmap
dat <- read.csv("C:/Users/?/Desktop/xcell/B .csv")
dat[is.na(dat)] = 0
dat1 <- as.matrix(dat[, 3:14])
rownames(dat1) <- dat$ID
dat2 <- scale(dat1)
pheatmap(dat2,cluster_cols = F,cluster_rows = F,main = "")
annot_row <- data.frame(row.names = data$ID, Group = data$Group)
pheatmap(data2,annotation_row=annot_row,cluster_cols = F,cluster_rows = F,main = "B P + R vs NR")
A <- c("KDR", "JAG1", "COL3A1", "COL5A2", "PDGFB", "CD34", "LAMA2", "CHI3L1", "ITGAV", "HSPG2", "PECAM1", "NID1", "TIMP1", "ANGPT2", "CCL2", "IGF1", "TEK", "LEP", "MMP2", "EFNB2", "EDN1", "CXCL12", "FGF7", "FGFR1", "TGFB2", "TGFB3")
mat <- assay(vsdata)[ A, ]
tmat <- t(mat)
ztmat <- scale(tmat, center = TRUE, scale = TRUE)
tztmat <- t(ztmat)
library(pheatmap)
df <- as.data.frame(colData(vsdata)[,c("ID","Group")])
pheatmap(tztmat, annotation_col=df, cluster_rows=F, cluster_cols=F)
data = penguins,
mapping = aes(x = flipper_length_mm, y = body_mass_g, color = species)
) +
geom_point() +
geom_smooth(method = "lm")
data = penguins,
mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
geom_point(mapping = aes(color = species)) +
geom_smooth(method = "lm")
data = penguins,
mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
geom_point(mapping = aes(color = species, shape = species)) +
geom_smooth(method = "lm")
data = penguins,
mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
geom_point(aes(color = species, shape = species)) +
geom_smooth(method = "lm") +
labs(
title = "Body mass and flipper length",
subtitle = "Dimensions for Adelie, Chinstrap, and Gentoo Penguins",
x = "Flipper length (mm)", y = "Body mass (g)",
color = "Species", shape = "Species"
) +
ggplot(penguins, aes(x = fct_infreq(species))) +
geom_bar()
ggplot(penguins, aes(x = body_mass_g)) +
geom_histogram(binwidth = 200)
ggplot(penguins, aes(x = body_mass_g, color = species)) +
geom_density(linewidth = 0.75)
ggplot(penguins, aes(x = body_mass_g, color = species, fill = species)) +
geom_density(alpha = 0.5)
ggplot(penguins, aes(x = island, fill = species))+
geom_bar()
ggplot(penguins, aes(x = island, fill =species)) +
geom_bar(position = "fill")
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
geom_point(aes(color = species, shape = island))
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
geom_point(aes(color = species, shape = species)) +
facet_wrap(~island)
ggsave(filename = "penguin-plot.png")
https://r4ds.hadley.nz/layers.html
library("RColorBrewer")
display.brewer.all()
library("RColorBrewer")
> display.brewer.all()
> pheatmap(dat2,annotation_row=annot_row,cluster_cols = F,cluster_rows = F, color = colorRampPalette(rev(brewer.pal(n = 7, name ="RdYlBu")))(100))
> pheatmap(dat2,annotation_row=annot_row,cluster_cols = F,cluster_rows = F, color = colorRampPalette(rev(brewer.pal(n = 7, name ="Spectral")))(100))
> pheatmap(dat2,annotation_row=annot_row,cluster_cols = F,cluster_rows = F, color = colorRampPalette(rev(brewer.pal(n = 7, name ="RdYlGn")))(100))
> pheatmap(dat2,annotation_row=annot_row,cluster_cols = F,cluster_rows = F, color = colorRampPalette(rev(brewer.pal(n = 7, name ="Set1")))(100))
>
pheatmap(dat2,annotation_row=annot_row,cluster_cols = F,cluster_rows = F, color = colorRampPalette(c("blue", "pink", "red"))(50))
Correlations were plotted using the corrplot package
pheatmap(test, color = colorRampPalette(c("navy", "white", "firebrick3"))(50))
if (!require(devtools)) install.packages("devtools")
devtools::install_github("yanlinlin82/ggvenn")
a <- list('A - R PvsPre, down' = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27),
'A - R PvsPre, up' = c(28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54),
'A - NR PvsPre, up' = c(28, 1, 29, 4, 3, 34, 6, 36, 37, 38, 11, 42, 43, 44, 15, 45, 16, 17, 18, 22, 48, 49, 51, 52, 26),
'A - NR PvsPre, down' = c(30, 31, 2, 3, 32, 5, 35, 7, 8, 9, 10, 12, 39, 40, 13, 41, 14, 46, 47, 19, 20, 21, 23, 24, 50, 25, 53, 54, 27))
ggvenn(a)
dmm <- dd1 %>% group_by(Gene.signature) %>% summarise_if(is.numeric, mean, na.rm = TRUE)
HNSC2 <- HNSC1[ , grepl( "01A" , names(HNSC1) ) ]
# this gives log2(n + 1)
ntd <- normTransform(dds)
library("vsn")
meanSdPlot(assay(ntd))
Remove columns from dataframe where ALL values are NA
x y z
1 1 1 NA
2 2 2 NA
3 3 NA NA
4 4 4 NA
5 5 5 NA
!apply (is.na(df), 2, all)
var1 var2 var3
TRUE TRUE FALSE
> df[, !apply(is.na(df), 2, all)]
var1 var2
1 1 1
2 2 2
3 3 1
4 4 3
5 5 4
6 6 NA
7 7 NA
8 NA 9
3. Row which contains all column values that are missing
Suppose if you want to remove all column values contains NA then following codes will be handy.
Method 1:Using is.na(), rowSums() & ncol() Functions
df=data.frame(Col1=c(NA,"B","C","D",
"P1","P2","P3")
,Col2=c(NA,8,NA,9,10,8,9)
,Col3=c(NA,7,6,8,NA,7,8)
,Col4=c(NA,NA,7,7,NA,7,7))
df[rowSums(is.na(df)) != ncol(df), ]
Col1 Col2 Col3 Col4
2 B 8 7 NA
3 C NA 6 7
4 D 9 8 7
5 P1 10 NA NA
6 P2 8 7 7
7 P3 9 8 7
Method 2:Using Using filter() Function
df=data.frame(Col1=c(NA,"B","C","D",
"P1","P2","P3")
,Col2=c(NA,8,NA,9,10,8,9)
,Col3=c(NA,7,6,8,NA,7,8)
,Col4=c(NA,NA,7,7,NA,7,7))
library("dplyr")
filter(df, rowSums(is.na(df)) != ncol(df))
Col1 Col2 Col3 Col4
1 B 8 7 NA
2 C NA 6 7
3 D 9 8 7
4 P1 10 NA NA
5 P2 8 7 7
6 P3 9 8 7
Replace NA with 0
SurDRRA[is.na(SurDRRA)] <- 0
b <- list(`MacKO:HetC_DIET,down` = 9:33, `MacKO:HetC_DIET,up` = 1:8, `KO:WT_DIET,up` = c(1:9, 13, 14, 18, 28:30, 32, 34:42), `KO:WT_DIET,down` = c(21, 23))
ggvenn(b, set_name_size = 3, text_size = 4, set_name_color = c("blue", "yellow4", "green4", "red")) + coord_cartesian(xlim = c(-3, 3), ylim = c(-3, 3))
ggvenn(a, set_name_size = 3, set_name_color = c("blue", "yellow4", "green4", "red"), text_size = 4, stroke_size = 0.5) + coord_cartesian(xlim = c(-3, 3), ylim = c(-3, 3)) + labs(title = "Stroma hallmark pathway")
ggvenn(a, fill_color = c("#F8766D", "#00BFC4"), text_size = 6, set_name_color = c("#F8766D", "#00BFC4"), set_name_size = 8)
heatmap
d <- data.table::fread("C:/Users/?/Desktop/D +vs-R TCIA.txt.gz", data.table = F)
data <- d %>% column_to_rownames("celltype")
ddd <- as.matrix(data)
heatmap(ddd, cexRow = 1, cexCol = 1, labRow = d$celltype, margins = c(8, 10))
pheatmap(ddd)
data <- d %>% column_to_rownames("celltype")
ddd <- as.matrix(data)
pheatmap(ddd,cluster_cols = F,cluster_rows = F,main = "ADM safeTME",
color=colorRampPalette(c("navy", "white", "red"))(50),
cutree_cols=2)
pheatmap(ddd,cluster_cols = F,cluster_rows = F,main = "ADM safeTME",
color=colorRampPalette(c("#00BFC4", "white", "#F8766D"))(50),
cutree_cols=2)
Spatial Decon
temp = replace(restils$prop_of_nontumor, is.na(restils$prop_of_nontumor), 0)
o = hclust(dist(temp))$order
layout(mat = (matrix(c(1, 2), 1)), widths = c(7, 3))
TIL_barplot(ddd, draw_legend = TRUE, cex.names = 0.5)
d <- data.table::fread("C:/Users/?/Desktop/Til/DTil/DTilDM1difference.txt.gz", data.table = F)
data <- d %>% column_to_rownames("celltype")
ddd <- as.matrix(data)
temp = replace(restils$prop_of_nontumor, is.na(restils$prop_of_nontumor), 0)
o = hclust(dist(temp))$order
layout(mat = (matrix(c(1, 2), 1)), widths = c(7, 3))
TIL_barplot(ddd, draw_legend = TRUE, cex.names = 0.5, main = "D DM P vs Pre TME cell type difference")
To access any other kind of file with compression, simply use gzfile("") around the file name:
write.table(tst.df,gzfile("test.dat.gz")) # write a compressed file read.table(gzfile("test.dat.gz"),row.names=1)# read it back in
results <- cibersort(HNSC, q_Norm, perm = 100, QN=TRUE)
pheatmap
Error in cut.default(x, breaks = breaks, include.lowest = T) :
'x' must be numeric
D5 <- apply(as.matrix(d4), 2, as.numeric)
Labels <- c(rep("R", 6),rep("P", 5),rep("N", 5),rep("N_P", 5))
names(Labels) <- rownames(d5)
Pos_DM <- as.data.frame(Labels)
p1 <- one.step.pigengene(Data=d1,saveDir='pigengene', bnNum=0, verbose=1,
seed=1, Labels=Labels, toCompact=FALSE, doHeat=FALSE)
h1 <- pheatmap.type(d5[,1:20],annRow= Pos_DM,show_rownames=FALSE)
ggplot(b, aes(fill=Cell.state, z.scores, x=celltype), size = 12) +
geom_bar(position="stack", stat="identity") + + theme(axis.text.x = element_text(angle = 90, hjust = 1))
Cell State _Negative _Neg Cell type Abundance _Positive _Pos Cell type Abundance
S01 _Neg_B cells_NR_P 0.00150677 _ Pos_B cells_NR_P 0.171670673
S02 _Neg_B cells_NR_P 0.088171129 _ Pos_B cells_NR_P 0.088419299
S03 _Neg_B cells_NR_P 0.057669946 _ Pos_B cells_NR_P 0.107745723
S04 _Neg_B cells_NR_P 0.032971537 _ Pos_B cells_NR_P 0.072909367
S05 _Neg_B cells_NR_P 0.047689777 _ Pos_B cells_NR_P 0.133199207
S01 _Neg_B cells_NR_Pre 0.019111569 _ Pos_B cells_NR_Pre 0.22481643
S02 _Neg_B cells_NR_Pre 0.061897723 _ Pos_B cells_NR_Pre 0.0780146
S03 _Neg_B cells_NR_Pre 0.091447498 _ Pos_B cells_NR_Pre 0.05800381
S04 _Neg_B cells_NR_Pre 0.07043731 _ Pos_B cells_NR_Pre 0.091037166
S05 _Neg_B cells_NR_Pre 0.061042366 _ Pos_B cells_NR_Pre 0.131287177
S01 _Neg_B cells_R_P 4.90E-05 _ Pos_B cells_R_P 0.072245259
S02 _Neg_B cells_R_P 0.113785657 _ Pos_B cells_R_P 0.084129808
S03 _Neg_B cells_R_P 0.102182471 _ Pos_B cells_R_P 0.144895115
S04 _Neg_B cells_R_P 0.044155086 _ Pos_B cells_R_P 0.114542694
S05 _Neg_B cells_R_P 0.119914577 _ Pos_B cells_R_P 0.058223155
library(circlize)
x = rnorm(2600)
factors = sample(letters, 2600, replace = TRUE)
circos.initialize(factors = factors, x = x)
circos.trackHist(factors = factors, x = x, track.height = 0.1, col = "#999999", border = "#999999")
circos.trackHist(factors = factors, x = x, force.ylim = FALSE, bin.size = 0.1, track.height = 0.1, col = "#999999", border = "#999999")
circos.trackHist(factors = factors, x = x, draw.density = TRUE, track.height = 0.1, col = "#999999", border = "#999999")
circos.trackHist(factors = factors, x = x, draw.density = TRUE, force.ylim = FALSE, track.height = 0.1, col = "#999999", border = "#999999")
par(mfrow = c(1, 2))
circos.initialize(letters[1:4], xlim = c(0, 10))
circos.track(ylim = c(0, 1), panel.fun = function(x, y) {
value = runif(10)
circos.barplot(value, 1:10 - 0.5, col = 1:10)
})
circos.track(ylim = c(-1, 1), panel.fun = function(x, y) {
value = runif(10, min = -1, max = 1)
circos.barplot(value, 1:10 - 0.5, col = ifelse(value > 0, 2, 3))
})
circos.clear()
circos.initialize(letters[1:4], xlim = c(0, 10))
circos.track(ylim = c(0, 4), panel.fun = function(x, y) {
value = matrix(runif(10*4), ncol = 4)
circos.barplot(value, 1:10 - 0.5, col = 2:5)
})
https://jokergoo.github.io/circlize_book/book/circos-heatmap.html#circos-heatmap-input-data
https://bioinformatics.ccr.cancer.gov/docs/data-visualization-with-r/Lesson5_intro_to_ggplot/
#data import from excel
data<-readxl::read_xlsx("./data/RNASeq_totalcounts_vs_totaltrans.xlsx",
1,.name_repair = "universal")
Readxl’s default is .name_repair = "unique"
scatter_plot<-ggplot(data=data) +
geom_point(aes(x=Number.of.Transcripts, y = Total.Counts,
color=Treatment))
scatter_plot +
scale_color_manual(values=c("red","black"),
labels=c('treated','untreated'))
scatter_plot +
scale_color_grey()
To apply scale_fill
to shape, we will have to alter the shapes, as only some shapes take a fill argument.
Image from https://ggplot2.tidyverse.org/articles/ggplot2-specs.html:
ggplot(data=data) +
geom_point(aes(x=Number.of.Transcripts, y = Total.Counts,fill=Treatment),
shape=21,size=2) + #increase size and change points
scale_fill_manual(values=c("purple", "yellow"))
## Class Sex Age Survived Freq
## 1 1st Male Child No 0
## 2 2nd Male Child No 0
## 3 3rd Male Child No 35
## 4 Crew Male Child No 0
## 5 1st Female Child No 0
## 6 2nd Female Child No 0
ggplot() + geom_col(data = Titanic, aes(x = Class, y = Freq, fill = Survived), position = "dodge") +
facet_grid(Sex ~ Age)
ggplot(data=data) +
geom_point(aes(x=Number.of.Transcripts, y = Total.Counts,
fill=Treatment),
shape=21,size=2) +
scale_fill_manual(values=c("purple", "yellow"),
labels=c('treated','untreated'))+
#can change labels of fill levels along with colors
xlab("Recovered transcripts per sample") + #add x label
ylab("Total sequences per sample") +#add y label
guides(fill = guide_legend(title="Treatment")) + #label the legend
scale_y_continuous(trans="log10") + #log transform the y axis
theme_bw()
ggsave("Plot1.png",width=5.5,height=3.5,units="in",dpi=300)
http://timer.comp-genomics.org/timer/
performed GSEA with MSigDB sets and used gseaplot2 to visualize the results:
display_geneset <- "IVANOVA_HEMATOPOIESIS_STEM_CELL_LONG_TERM"
p_msig <- gseaplot2(gse_cgp, geneSetID = display_geneset, title = gse_cgp$Description[gse_cgp$ID == display_geneset],
color = "green", base_size = 5) + theme( aspect.ratio = 1/1.618)
p_msig
ggsave("output/clusterProfiler_GSEA_IVANOVA_small.svg", width=6, units = "cm", scale = 1)
Example 1: S4 Class and Object in R
# create a class "Student_Info" with three member variables
setClass("Employee_Info", slots=list(name="character", age="numeric", role="character"))
# create an object of class
employee1 <- new("Employee_Info", name = "Peter", age = 21, role = "Developer")
# call employee1 object
employee1
https://rpubs.com/pranali018/enrichmentAnalysis
library(enrichplot)
library(ggplot2)
library("org.Hs.eg.db")
library(clusterProfiler)
library(pathview)
require(DOSE)
dotplot(gse, showCategory=3, split=".sign") + facet_grid(.~.sign)
#remove any rows where GeneSymbol is not present
num = which(is.na(res$GeneSymbol))
res = res[-num, ]
# we want the log2 fold change
original_gene_list = res$log2FoldChange
# name the vector
names(original_gene_list) <- res$GeneSymbol
# omit any NA values
gene_list<-na.omit(original_gene_list)
# sort the list in decreasing order (required for clusterProfiler)
gene_list = sort(gene_list, decreasing = TRUE)
gse <- gseGO(geneList=gene_list,
ont ="ALL",
keyType = "SYMBOL",
minGSSize = 3,
maxGSSize = 800,
pvalueCutoff = 0.05,
verbose = TRUE,
OrgDb = org.Hs.eg.db,
pAdjustMethod = "none")
out = as.matrix(gse@result)
dotplot(gse, showCategory=3, split=".sign") + facet_grid(.~.sign)
The dotplot shows the ontology terms which are either activated (up regulated) or suppresed(down regulated) in the dataset. The number of ontology terms per feature (BP, MF, CC) to be displayed is determined by the showCateory function. The color of the dots indicate the pvalues for each of the terms and the size of the dots depicts the number of genes overlapping in each feature
x2 = pairwise_termsim(gse)
emapplot(x2, showCategory = 20)
Enrichment map organizes enriched terms into a network with edges connecting overlapping gene sets. In this way, mutually overlapping gene sets are tend to cluster together, making it easy to identify functional modules.
# Use the `Gene Set` param for the index in the title, and as the value for geneSetId
gseaplot(gse, by = "all", title = gse$Description[1], geneSetID = 1)
Plot of the Running Enrichment Score (green line) for a gene set as the analysis walks down the ranked gene list, including the location of the maximum enrichment score (the red line). The black lines in the Running Enrichment Score show where the members of the gene set appear in the ranked list of genes, indicating the leading edge subset.
The Ranked list metric shows the value of the ranking metric (log2 fold change) as you move down the list of ranked genes. The ranking metric measures a gene’s correlation with a phenotype.
gseaplot(kk2, by = "all", title = kk2$Description[19], geneSetID = 19)
r1 = pathview(gene.data = kegg_gene_list, pathway.id = "hsa00190", species = kegg_organism)
gene_list = sort(ranks, decreasing = TRUE)
hallmark <- GSEA(
geneList = gene_list,
exponent = 1,
minGSSize = 10,
maxGSSize = 500,
eps = 1e-10,
pvalueCutoff = 0.05,
pAdjustMethod = "BH",
gson = NULL,
TERM2GENE = hall,
TERM2NAME = NA,
verbose = TRUE,
seed = FALSE,
by = "fgsea")
gseaplot(kk2, by = "all", title = kk2$Description[1], geneSetID = 1)
DESeqAnalysis
if (!requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
install.packages(
pkgs = "DESeqAnalysis",
repos = c(
"https://r.acidgenomics.com",
BiocManager::repositories()
),
dependencies = TRUE
)
https://rdrr.io/github/acidgenomics/DESeqAnalysis/man/DESeqAnalysis.html
resultsNames(data)
name <- resultsNames(data)[[2L]]
results <- DESeq2::results(data, name = name)
class(results)
lfcShrink <- DESeq2::lfcShrink(dds = data, res = results, coef = 2L)
results <- list(results)
names(results) <- name
lfcShrink <- list(lfcShrink)
names(lfcShrink) <- name
object <- DESeqAnalysis(
data = data,
transform = transform,
results = results,
lfcShrink = lfcShrink
)
plotDegStackedBar(
object,
i = 1L,
direction = c("both", "up", "down"),
orderBySize = FALSE,
label = TRUE,
flip = TRUE
)
assay(object@transform)
assay(object@data)
results(object, i = 1L)
sampleData(object)
genes <- head(rownames(object), 50L)
head(genes)
samples <- head(colnames(object), 2L)
head(samples)
x <- object[genes, samples]
mai <- assay(vsdata)[HY, ]
mai <- as.data.frame(mai)
ma <- res %>% filter((GeneSymbol %in% HY) & pvalue < 0.05)
mai1 <- mai %>% mutate(rownames(mai))
names(mai1)[11] <- "GeneSymbol"
mai2 <- merge(mai1, ma)
mai3 <- mai2 %>% arrange(desc(TD23_2))
mai4 <- apply(as.matrix(mai3[, 1:11]), 2, as.numeric)
rownames(mai4) <- mai3$GeneSymbol
df <- as.data.frame(colData(vsdata)[, c("ID", "Group")])
pheatmap(mai4[,-1], annotation_col=df, cluster_rows=F, cluster_cols=F, main = "HALLMARK_HYPOXIA")
mai <- assay(vsdata)[IN, ]
mai <- as.data.frame(mai)
ma <- res %>% filter((GeneSymbol %in% IN) & pvalue < 0.05)
mai1 <- mai %>% mutate(rownames(mai))
names(mai1)[10] <- "GeneSymbol"
mai2 <- merge(mai1, ma)
mai3 <- mai2 %>% arrange(desc(TD21_2))
mai4 <- apply(as.matrix(mai3[, 1:10]), 2, as.numeric)
rownames(mai4) <- mai3$GeneSymbol
df <- as.data.frame(colData(vsdata)[, c("ID", "Group")])
pheatmap(mai4[,-1], annotation_col=df, cluster_rows=F, cluster_cols=F, main = " -NR HALLMARK_TNFA_SIGNALING_VIA_NFKB ")
mai <- assay(vsdata)[IN, ]
mai <- as.data.frame(mai)
ma <- res %>% filter((GeneSymbol %in% IN) & pvalue < 0.05)
mai1 <- mai %>% mutate(rownames(mai))
names(mai1)[24] <- "GeneSymbol"
mai2 <- merge(mai1, ma)
mai3 <- mai2 %>% arrange(desc(CK .R33.RM29P))
mai4 <- apply(as.matrix(mai3[, 1:24]), 2, as.numeric)
rownames(mai4) <- mai3$GeneSymbol
df <- as.data.frame(colData(vsdata)[, c("ID", " Treatment")])
pheatmap(mai4[,-1], annotation_col=df, cluster_rows=F, cluster_cols=F, main = " NFKB ")
gseaplot2(
gse,
geneSetID = 876,
title = gse$Description[876],
color = "green",
base_size = 11,
rel_heights = c(1.5, 0.5, 1),
subplots = 1:3,
pvalue_table = FALSE,
ES_geom = "line"
)
Point label
https://bookdown.org/ndphillips/YaRrr/low-level-plotting-functions.html
before <- c(2.1, 3.5, 1.8, 4.2, 2.4, 3.9, 2.1, 4.4)
after <- c(7.5, 5.1, 6.9, 3.6, 7.5, 5.2, 6.1, 7.3)
# Create plotting space and before scores
plot(x = rep(1, length(before)),
y = before,
xlim = c(.5, 2.5),
ylim = c(0, 11),
ylab = "Score",
xlab = "Time",
main = "Using segments() to connect points",
xaxt = "n")
# Add after scores
points(x = rep(2, length(after)), y = after)
# Add connections with segments()
segments(x0 = rep(1, length(before)),
y0 = before,
x1 = rep(2, length(after)),
y1 = after,
col = gray(0, .5))
# Add labels
mtext(text = c("Before", "After"),
side = 1, at = c(1, 2), line = 1)
Connecting points with segments().
Figure 11.10: Connecting points with segments().
plot(x = pirates$height,
y = pirates$weight,
pch = 16,
col = transparent("skyblue", .7),
main = "Adding a regression line to a scatterplot()")
# Add the regression line
abline(lm(weight ~ height, data = pirates),
lty = 2)
plot(x = height,
y = weight,
xlim = c(155, 180),
ylim = c(65, 80),
pch = 16,
col = yarrr::piratepal("xmen"))
# Add id labels
text(x = height,
y = weight,
labels = id,
pos = 3) # Put labels above the points
# Create the plot
plot(x = ChickWeight$Time,
y = ChickWeight$weight,
col = gray(.3, .5),
pch = 16,
main = "Combining text with numeric scalers using paste()")
# Add reference line
abline(h = mean(ChickWeight$weight),
lty = 2)
# Add text
text(x = 3,
y = mean(ChickWeight$weight),
labels = paste("Mean weight =",
round(mean(ChickWeight$weight), 2)),
pos = 3)
layout(mat = matrix(c(2, 1, 0, 3),
nrow = 2,
ncol = 2),
heights = c(1, 2), # Heights of the two rows
widths = c(2, 1)) # Widths of the two columns
# Plot 1: Scatterplot
par(mar = c(5, 4, 0, 0))
plot(x = pirates$height,
y = pirates$weight,
xlab = "height",
ylab = "weight",
pch = 16,
col = yarrr::piratepal("pony", trans = .7))
# Plot 2: Top (height) boxplot
par(mar = c(0, 4, 0, 0))
boxplot(pirates$height, xaxt = "n",
yaxt = "n", bty = "n", yaxt = "n",
col = "white", frame = FALSE, horizontal = TRUE)
# Plot 3: Right (weight) boxplot
par(mar = c(5, 0, 0, 0))
boxplot(pirates$weight, xaxt = "n",
yaxt = "n", bty = "n", yaxt = "n",
col = "white", frame = F)
cor.test(formula = ~ Zyx + Zzz3, data = b2)
Pearson's product-moment correlation
data: Zyx and Zzz3
t = 1.265, df = 15, p-value = 0.2252
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.2000037 0.6884001
sample estimates:
cor
0.3104886
d$ID[duplicated(d$ID)]
[1] "SLC35E2"
> which(duplicated(d$ID))
[1] 16273
> which(d$ID == "SLC35E2")
[1] 16272 16273
subset(x = ToothGrowth,
subset = len > 30 & supp == "VC",
select = c(len, dose))
Table 9.1: Functions for managing your workspace, working directory, and writing data from R as .txt or .RData files, and reading files into R
Code Description
ls() Display all objects in the current workspace
rm(a, b, ..) Removes the objects a, b… from your workspace
rm(list = ls()) Removes all objects in your workspace
getwd() Returns the current working directory
setwd(file = "dir) Changes the working directory to a specified file location
list.files() Returns the names of all files in the working directory
write.table(x, file = "mydata.txt", sep) writes the object x to a text file called mydata.txt. Define how the columns will be separated with sep (e.g.; sep = "," for a comma–separated file, and sep = t" for a tab–separated file).
save(a, b, .., file = "myimage.RData) Saves objects a, b, … to myimage.RData
save.image(file = "myimage.RData") Saves all objects in your workspace to myimage.RData
load(file = "myimage.RData") Loads objects in the file myimage.RData
read.table(file = "mydata.txt", sep, header) Reads a text file called mydata.txt, define how columns are separated with sep (e.g. sep = "," for comma-delimited files, and sep = "t" for tab-delimited files), and whether there is a header column with header = TRUE
save(a, b, c, file = "myobjects.RData"
)
save.image(file = "data/projectimage.RData")
study1.df <- data.frame(id = 1:5,
sex = c("m", "m", "f", "f", "m"),
score = c(51, 20, 67, 52, 42))
score.by.sex <- aggregate(score ~ sex,
FUN = mean,
data = study1.df)
study1.htest <- t.test(score ~ sex,
data = study1.df)
mydata <- read.table(file = 'data/mydata.txt', # file is in a data folder in my working directory
sep = 't', # file is tab--delimited
header = TRUE, # the first row of the data is a header row
stringsAsFactors = FALSE) # do NOT convert strings to factors!!
# Read a text file from the web
fromweb <- read.table(file = 'http://goo.gl/jTNf6P',
sep = 't',
header = TRUE)
pirates <- pirates[order(pirates$sex, pirates$height),]
combined.survey <- merge(x = risk.survey,
y = happiness.survey,
by = "participant",
all = TRUE)
hist(x = ChickWeight$weight[ChickWeight$Diet == 1],
main = "Two Histograms in one",
xlab = "Weight",
ylab = "Frequency",
breaks = 20,
xlim = c(0, 500),
col = gray(0, .5))
hist(x = ChickWeight$weight[ChickWeight$Diet == 2],
breaks = 30,
add = TRUE, # Add plot to previous one!
col = gray(1, .8))
barplot(height = 1:5, # A vector of heights
names.arg = c("G1", "G2", "G3", "G4", "G5"), # A vector of names
main = "Example Barplot",
xlab = "Group",
ylab = "Height")
swim.data
## Naked Clothed
## No Shark 2.1 1.5
## Shark 3.0 3.0
Now, when I enter this matrix as the height = swim.data argument to barplot(), I’ll get multiple bars.
barplot(height = swim.data,
beside = TRUE, # Put the bars next to each other
legend.text = TRUE, # Add a legend
col = c(transparent("green", .2),
transparent("red", .2)),
main = "Swimming Speed Experiment",
ylab = "Speed (in meters / second)",
xlab = "Clothing Condition",
ylim = c(0, 4))
https://cran.r-project.org/web/packages/pals/vignettes/bivariate_choropleths.html
https://rfunctions.blogspot.com/2015/03/bivariate-maps-bivariatemap-function.html
WGCNA
https://fuzzyatelin.github.io/bioanth-stats/module-F21-Group1/module-F21-Group1.html
d <- dat1 %>% separate_wider_delim(`X|Y`, delim = "|", names = c("X", "Y"))
https://rdrr.io/cran/DGEobj.utils/man/convertCounts.html
https://alexslemonade.github.io/refinebio-examples/03-rnaseq/pathway-analysis_rnaseq_03_gsva.html
library(vtable)
dd <- st(dat, group = 'Condition', group.test = TRUE)
dat1 <- dat[, 2:52] %>%
gather(key = variable, value = value, - Condition) %>%
group_by(Condition, variable) %>%
summarise(value = list(value)) %>%
spread(Condition, value) %>%
group_by(variable) %>%
mutate(p_value = t.test(unlist(TT), unlist(NT))$p.value,
t_value = t.test(unlist(TT), unlist(NT))$statistic)
library(annotables)
grch38 %>% filter(ensgene %in% )
ANGIOGENESIS_lm <- lm(HALLMARK_ANGIOGENESIS ~ MFP + _status, data = HALL_clinic_TME)
ANGIOGENESIS.II.aov <- car::Anova(ANGIOGENESIS_lm, type = 2)
Correlation
https://statsandr.com/blog/correlation-coefficient-and-correlation-test-in-r/
library("ggpubr")
ggscatter(CE9F, x = "CE10", y = "OS",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
xlab = "CE10_F", ylab = "OS")
cor.test(formula = ~ CE9 + OS,
data = Pos_CE1,
subset = MFP == "D")
t.test(formula = CE10 ~ MFP,
data = CE9IE_F1)
CE1 CE2 CE3 CE4
TCGA-BA-5152 0.15368566 0.276724867 0.19438770 0.044953297
TCGA-BA-5153 0.09342664 0.046838160 0.02334477 0.006748026
TCGA-BA-5559 0.07732019 0.006907233 0.07898090 0.252107417
round(cor(dat), 2)
CE1 CE2 CE3 CE4 CE5 CE6 CE7 CE8 CE9 CE10 OS
CE1 1.00 0.12 0.20 0.07 -0.35 0.15 -0.26 -0.60 -0.26 -0.30 -0.08
CE2 0.12 1.00 0.03 -0.10 -0.34 -0.38 0.13 -0.14 -0.34 -0.52 -0.08
CE3 0.20 0.03 1.00 0.02 -0.35 -0.03 -0.35 -0.57 0.28 -0.01 -0.20
CE4 0.07 -0.10 0.02 1.00 -0.06 0.32 -0.42 -0.21 -0.27 -0.25 -0.07
CE5 -0.35 -0.34 -0.35 -0.06 1.00 0.10 -0.05 0.57 -0.19 -0.09 -0.14
CE6 0.15 -0.38 -0.03 0.32 0.10 1.00 -0.23 -0.16 -0.31 0.00 0.00
CE7 -0.26 0.13 -0.35 -0.42 -0.05 -0.23 1.00 0.48 -0.15 -0.17 0.10
CE8 -0.60 -0.14 -0.57 -0.21 0.57 -0.16 0.48 1.00 -0.19 -0.10 0.12
CE9 -0.26 -0.34 0.28 -0.27 -0.19 -0.31 -0.15 -0.19 1.00 0.54 0.07
CE10 -0.30 -0.52 -0.01 -0.25 -0.09 0.00 -0.17 -0.10 0.54 1.00 0.21
OS -0.08 -0.08 -0.20 -0.07 -0.14 0.00 0.10 0.12 0.07 0.21 1.00
ggplot(dat) +
aes(x = CE1, y = CE2) +
geom_point(colour = "#0c4c8a") +
theme_minimal()
pairs(dat[, c("CE9", "CE10", "OS")])
library(corrplot)
corrplot(cor(dat),
method = "number",
type = "upper" # show only upper side
)
correlation tests for whole dataset
library(Hmisc)
res <- rcorr(as.matrix(dat)) # rcorr() accepts matrices only
# display p-values (rounded to 3 decimals)
round(res$P, 3)
library(ggstatsplot)
ggscatterstats(
data = dat,
x = CE1,
y = CE2,
bf.message = FALSE,
marginal = FALSE # remove histograms
)
library(correlation)
correlation::correlation(dat,
include_factors = TRUE, method = "auto"
)