longteng

随笔记录, 工作日记.

首页文章列表博文目录

个人资料

longteng24

给我悄悄话

博客访问：

storehouse1

(2024-11-22 19:09:54) 下一个

cat IN1_S1_L001_R1_001.fastq IN1_S1_L002_R1_001.fastq > IN1_S1_concatenated_R1_001.fastq

cat IN1_S1_L001_R2_001.fastq IN1_S1_L002_R2_001.fastq > IN1_S1_concatenated_R2_001.fastq

https://bioinformatics-core-shared-training.github.io/RNAseq-R/align-and-count.nb.html

library(readr)

library(dplyr)

library(ggplot2)

library(tidyverse)

library(DESeq2)

mycounts <- read.csv("C:/Users/?/Desktop/mycounts.csv")

metadata <- read.csv("C:/Users/?/Desktop/metadata.csv")

#countData <- mycounts[!(duplicated(mycounts[[1]]) | duplicated(mycounts[[1]], fromLast=TRUE)), ]

table(duplicated(mycounts$symbol))

mycounts<- mycounts[duplicated(mycounts$symbol) == FALSE,]

#rownames(mycounts) = make.names(mycounts$gene, unique=TRUE)

mycounts <- as.data.frame(mycounts)

metadata <- as.data.frame(metadata)

head(mycounts)

head(metadata)

class(mycounts)

class(metadata)

dds <- DESeqDataSetFromMatrix(countData=mycounts,

colData=metadata,

design=~Group,

tidy=TRUE)

dds

keep <- rowSums(counts(dds)) >= 10

dds <- dds[keep,]

#smallestGroupSize <- 3

#keep <- rowSums(counts(dds) >= 10) >= smallestGroupSize

#dds <- dds[keep,]

#keep <- rowSums(counts(dds) >= 15) >= 34 #remove all genes with counts<15 in more than 75% samples(45*0.75=33.75)

#dds <- dds[keep,]

sizeFactors(dds)

dispersions(dds)

dds <- DESeq(dds)

res <- results(dds, tidy=TRUE)

res = results(dds, contrast=c("Group", "Treat", "CNL"), tidy=TRUE)

res <- as_tibble(res)

res

res %>%

filter(padj<0.05) %>%

write_csv("sigresults.csv")

vsdata <- vst(dds, blind=FALSE)

plotPCA(vsdata, intgroup="Group") + geom_label(aes(label = colnames(vsdata))) + labs(title=" PCA")

df <- as.data.frame(colData(vsdata)[,c("Treat","Group","D","R"," ")])

H <- c("P53","MYC")

mat <- assay(vsdata)[ H, ]

mat <- as.data.frame(mat)

mat1 <- mat %>% arrange(desc(T20))

pheatmap(mat1, annotation_col=df, cluster_rows=F, cluster_cols=F,main = "heatmap")

plotCounts(dds, gene="EN", intgroup="Group", returnData=TRUE) %>%

ggplot(aes(Group, count)) + geom_boxplot(aes(fill=Group)) + scale_y_log10() + ggtitle("")

vsdata <- varianceStabilizingTransformation(dds) (less row)

#Volcano

p <- ggplot(data=res, aes(x=log2FoldChange, y=-log10(pvalue))) + geom_point()

# Add more simple "theme"

p <- ggplot(data=res, aes(x=log2FoldChange, y=-log10(pvalue))) + geom_point() + theme_minimal()

# Add vertical lines for log2FoldChange thresholds, and one horizontal line for the p-value threshold

p2 <- p + geom_vline(xintercept=c(-0.6, 0.6), col="red") +

geom_hline(yintercept=-log10(0.05), col="red")

# The significantly differentially expressed genes are the ones found in the upper-left and upper-right corners.

# Add a column to the data frame to specify if they are UP- or DOWN- regulated (log2FoldChange respectively positive or negative)

# add a column of NAs

res$diffexpressed <- "NO"

# if log2Foldchange > 0.6 and pvalue < 0.05, set as "UP"

res$diffexpressed[res$log2FoldChange > 0.6 & res$pvalue < 0.05] <- "UP"

# if log2Foldchange < -0.6 and pvalue < 0.05, set as "DOWN"

res$diffexpressed[res$log2FoldChange < -0.6 & res$pvalue < 0.05] <- "DOWN"

# Re-plot but this time color the points with "diffexpressed"

p <- ggplot(data=res, aes(x=log2FoldChange, y=-log10(pvalue), col=diffexpressed)) + geom_point() + theme_minimal()

# Add lines as before...

p2 <- p + geom_vline(xintercept=c(-0.6, 0.6), col="red") +

geom_hline(yintercept=-log10(0.05), col="red")

## Change point color

# 1. by resfault, it is assigned to the categories in an alphabetical orresr):

p3 <- p2 + scale_color_manual(values=c("blue", "black", "red"))

# 2. to automate a bit: ceate a named vector: the values are the colors to be used, the names are the categories they will be assigned to:

mycolors <- c("blue", "red", "black")

names(mycolors) <- c("DOWN", "UP", "NO")

p3 <- p2 + scale_colour_manual(values = mycolors)

# Now write down the name of genes besires the points...

# Create a new column "reslabel" to res, that will contain the name of genes differentially expressed (NA in case they are not)

res$delabel <- NA

res$delabel[res$diffexpressed != "NO"] <- res$row[res$diffexpressed != "NO"]

ggplot(data=res, aes(x=log2FoldChange, y=-log10(pvalue), col=diffexpressed, label=delabel)) +

geom_point() +

theme_minimal() +

geom_text()

library(ggrepel)

# plot adding up all layers we have seen so far

ggplot(data=res, aes(x=log2FoldChange, y=-log10(pvalue), col=diffexpressed, label=delabel)) +

geom_point() +

theme_minimal() +

geom_text_repel() +

scale_color_manual(values=c("blue", "black", "red")) +

geom_vline(xintercept=c(-0.6, 0.6), col="red") +

geom_hline(yintercept=-log10(0.05), col="red")

plotCounts(dds, gene="", intgroup="Group", returnData=TRUE) %>% ggplot(aes(Group, count)) + geom_boxplot(aes(fill=roup)) + scale_y_log10() + ggtitle("") + scale_x_discrete(name ="Group", limits=c("CNL","miR"))

plotCounts(dds, gene="BTF3", intgroup="Morphologic.Classification", returnData=TRUE) %>% ggplot(aes(Morphologic.Classification, count)) + geom_boxplot(aes(fill=Morphologic.Classification)) + scale_y_log10() + ggtitle(" ") + theme(plot.title = element_text(size = 30)) + scale_x_discrete(name ="Morphologic.Classification", limits=c("m0 ","m1","m2","m3","m4","m5"))

https://www.data-to-viz.com/caveat/boxplot.html

ggplot(data = dat, aes(x=Treatment, y=cell., fill=Treatment)) + scale_x_discrete(name ="Treatment", limits=c("UT","B","A","B+A")) + geom_violin(width=0.8) + geom_boxplot(width=0.8, color="black", alpha=0.8) + theme( legend.position="none", plot.title = element_text(size=15)) + ggtitle("Treatment") + theme(axis.text = element_text(size=12)) + theme(panel.background = element_rect(fill = "white", colour = "grey50"))

library("genefilter")

vsdata <- vst(dds, blind=FALSE)

sum( rowMeans( counts(dds, normalized=TRUE)) > 5 )

plotPCA(vsdata, intgroup="Group")

plotPCA(vsdata, intgroup="Group") + geom_label(aes(label = colnames(vsdata)))

topVarGenes <- head(order(-rowVars(assay(vsdata))),50)

mat <- assay(vsdata)[ topVarGenes, ]

mat <- mat - rowMeans(mat)

df <- as.data.frame(colData(vsdata)[,c("ID","C","B")])

pheatmap(mat, annotation_col=df, cluster_rows=FALSE, cluster_cols=FALSE)

res1 <- res %>% filter(padj<0.05) %>% arrange(padj)

res2 <- res1[1:100,] %>% arrange(desc(log2FoldChange))

Sig <- res2$row

mat <- assay(vsdata)[ Sig, ]

mat <- mat - rowMeans(mat)

df <- as.data.frame(colData(vsdata)[,c("ID","Group")])

pheatmap(mat, annotation_col=df, cluster_rows=F, cluster_cols=F)

zmat <- scale(tmat1, center = TRUE, scale = TRUE) # upon column

sacPlsda <- opls(vsdata, "Treatment", orthoI = 1)

PLS-DA

library(org.Hs.eg.db)

ens2symbol <- AnnotationDbi::select(org.Hs.eg.db,

key=res$row,

columns="SYMBOL",

keytype="SYMBOL")

ens2symbol <- as_tibble(ens2symbol)

ens2symbol

res2 <- res %>%

dplyr::select(SYMBOL, stat) %>%

na.omit() %>%

distinct() %>%

group_by(SYMBOL) %>%

summarize(stat=mean(stat))

res2

library(fgsea)

ranks <- deframe(res2)

head(ranks, 20)

pathways.hallmark <- gmtPathways("C:/Users/?/Desktop/GSEA dataset/h.all.v2023.1.Hs.symbols.gmt")

fgseaRes <- fgsea(pathways=pathways.hallmark, stats=ranks, nperm=1000)

fgseaResTidy <- fgseaRes %>%

as_tibble() %>%

arrange(desc(NES))

# Show in a nice table:

hallmark <- fgseaResTidy %>%

dplyr::select(-leadingEdge, -ES, -nMoreExtreme) %>%

arrange(padj)

hallmark <- as.matrix(hallmark)

write.csv(hallmark, " -NRhallmark.csv")

pathways.hallmark <- gmtPathways("C:/Users/?/Desktop/GSEA dataset/ h.all.v2023.1.Hs.symbols.gmt")

fgseaRes <- fgsea(pathways=pathways.hallmark, stats=ranks, nperm=1000)

fgseaRes <- fgsea(pathways=pathways.hallmark, stats=ranks, eps = 0.0,

minSize = 15,

maxSize = 500)

fgseaResTidy <- fgseaRes %>%

as_tibble() %>%

arrange(desc(NES))

# Show in a nice table:

fgseaResTidy %>%

dplyr::select(-leadingEdge, -ES, -nMoreExtreme) %>%

arrange(padj) %>%

DT::datatable()

ggplot(fgseaResTidy, aes(reorder(pathway, NES), NES)) +

geom_col(aes(fill=padj<0.05)) +

coord_flip() +

labs(x="Pathway", y="Normalized Enrichment Score",

title="Hallmark pathways NES from GSEA") +

theme_minimal()

fgseaRes <- fgsea(pathways=pathways.hallmark, stats=ranks, nperm=1000)

go <- fgsea(pathways=gmtPathways("C:/Users/?/Desktop/GSEA dataset/c5.all.v7.2.symbols.gmt"), ranks, nperm=1000) %>% as_tibble()

Re <- fgsea(pathways=gmtPathways("C:/Users/?/Downloads/c2.cp.reactome.v2023.1.Hs.symbols.gmt"), ranks, nperm=1000) %>%

as_tibble()

go <- as.matrix(go)

write.csv(go, "go.csv")

Re <- as.matrix(Re)

write.csv(Re, "Re.csv")

hallmark <- fgsea(pathways=gmtPathways("C:/Users/?/Desktop/GSEA dataset/h.all.v7.2.symbols.gmt "), ranks, nperm=1000) %>% as_tibble()

hallmark <- as.matrix(hallmark)

write.csv(hallmark, "hallmark B PRE + R vs NR.csv")

plotEnrichment(pathways.hallmark[["HALLMARK_INTERFERON_GAMMA_RESPONSE"]], ranks) + labs(title="HALLMARK_INTERFERON_GAMMA_RESPONSE")

plotEnrichment(pathways.hallmark[["HALLMARK_APOPTOSIS"]], ranks) + labs(title=" HALLMARK_APOPTOSIS")

pathways.immunmark <-gmtPathways("C:/Users/?/Dowloads/c7.all.v2022.1.Hs.symbols.gmt")

cell type signature gene sets

pathways.cellmark <- gmtPathways("C:/Users/?/Downloads/c8.all.v2022.1.Hs.symbols.gmt")

ggplot(Hal, aes(NES, NAME))+ geom_col(aes(fill=NES<0)) + scale_y_discrete(name ="NAME", limits=c("MITOTIC_SPINDLE", "IL6_JAK_STAT3_SIGNALING", "EPITHELIAL_MESENCHYMAL_TRANSITION")) + labs(x="Pathway", y="Normalized Enrichment Score", title="Hallmark pathways NES from GSEA") + theme_minimal()

	pathway	pval	padj	NES	size
1	HALLMARK_E2F_TARGETS	0.003125	0.016	1.50255883270908	196
2	HALLMARK_P53_PATHWAY	0.00307692307692308	0.016	1.46976609146818	193

ggplot(fgseaResTidy, aes(reorder(pathway, NES), NES)) +

geom_col(aes(fill=padj<0.05)) +

coord_flip() +

labs(x="Pathway", y="Normalized Enrichment Score",

title="Hallmark pathways NES from GSEA") +

theme_minimal()

fgseaRes <- fgsea(pathways=pathways.hallmark, stats=ranks, nperm=1000)

go <- fgsea(pathways=gmtPathways("C:/Users/?/Desktop/GSEA dataset/c5.all.v7.2.symbols.gmt"), ranks, nperm=1000) %>% as_tibble()

Re <- fgsea(pathways=gmtPathways("C:/Users/?/Downloads/c2.cp.reactome.v2023.1.Hs.symbols.gmt"), ranks, nperm=1000) %>%

as_tibble()

go <- as.matrix(go)

write.csv(go, "go.csv")

Re <- as.matrix(Re)

write.csv(Re, "Re.csv")

hallmark <- fgsea(pathways=gmtPathways("C:/Users/?/Desktop/GSEA dataset/h.all.v7.2.symbols.gmt "), ranks, nperm=1000) %>% as_tibble()

hallmark <- as.matrix(hallmark)

write.csv(hallmark, "hallmark B PRE + R vs NR.csv")

plotEnrichment(pathways.hallmark[["HALLMARK_INTERFERON_GAMMA_RESPONSE"]],

ranks) + labs(title="HALLMARK_INTERFERON_GAMMA_RESPONSE")

plotEnrichment(pathways.hallmark[["HALLMARK_APOPTOSIS"]],

ranks) + labs(title=" HALLMARK_APOPTOSIS")

pathways.immunmark <- gmtPathways("C:/Users/?/Dowloads/c7.all.v2022.1.Hs.symbols.gmt")

cell type signature gene sets

pathways.cellmark <- gmtPathways("C:/Users/?/Downloads/c8.all.v2022.1.Hs.symbols.gmt")

ggplot(Hal, aes(NES, NAME))+ geom_col(aes(fill=NES<0)) + scale_y_discrete(name ="NAME", limits=c("MITOTIC_SPINDLE", "IL6_JAK_STAT3_SIGNALING", "EPITHELIAL_MESENCHYMAL_TRANSITION", "APICAL_SURFACE", "COAGULATION", "ANDROGEN_RESPONSE", "UV_RESPONSE_UP", "HEDGEHOG_SIGNALING", "ADIPOGENESIS", "APOPTOSIS", "APICAL_JUNCTION", "INTERFERON_ALPHA_RESPONSE", "XENOBIOTIC_METABOLISM", "KRAS_SIGNALING_DN", "IL2_STAT5_SIGNALING", "HEME_METABOLISM", "UV_RESPONSE_DN", "INFLAMMATORY_RESPONSE", "MYOGENESIS", "ESTROGEN_RESPONSE_LATE", "KRAS_SIGNALING_UP", "PEROXISOME", "ESTROGEN_RESPONSE_EARLY", "INTERFERON_GAMMA_RESPONSE", "ALLOGRAFT_REJECTION", "GLYCOLYSIS", "HYPOXIA")) +

labs(x="Pathway", y="Normalized Enrichment Score",

title="Hallmark pathways NES from GSEA") +

theme_minimal()

212 HAY_BONE_MARROW_CD8_T_CELL 0.001317523 0.006216336 -0.548956711 -1.973916357 0 71 c("PARP8", "PATL2", "TTC16", "PTPRC", "MIAT", "KLRG1", "KIF19", "GBP5", "FYN", "LINC00987", "TRGC2", "NR4A2", "ATP8A1", "TRG-AS1", "RASAL3", "PPP2R5C", "PTPN22", "DUSP2", "CD8B", "HERPUD2", "PTPN7", "SLAMF6", "F2R", "SLA", "SH2D1A", "GZMK", "CD8A", "SYNE2", "ATG2A", "GZMH", "A2M-AS1", "SON", "CXCR4", "ZNF683", "LINC01871", "HLA-B", "JMJD6")

=CHAR(34)&A1&CHAR(34)

mat <- assay(vsdata)[ T, ]

mat <- mat - rowMeans(mat)

df <- as.data.frame(colData(vsdata)[,c("ID","Response.Primary","LN.Responder","Responder"," ","Group","Treatment","T.Stage", "N.Stage")])

pheatmap(mat, annotation_col=df, cluster_rows=F, cluster_cols=F)

Removing NA or NaN values

There are two ways to remove missing values:

Extracting values except for NA or NaN values:

Example 1:

· R

x <- c(1, 2, NA, 3, NA, 4)

d <- is.na(x)

x[! d]

Output:

[1] 1 2 3 4

Example 2:

· R

x <- c(1, 2, 0 / 0, 3, NA, 4, 0 / 0)

x[! is.na(x)]

Output:

[1] 1 2 NaN 3 NA 4 NaN

[1] 1 2 3 4

d <- filter(mycounts, X %in% c("ABCB6", "ADORA2B", "AGL"))

xcell

Install

devtools::install_github('dviraran/xCell')

Note: if the installation fails, try setting options(unzip = "internal") before calling the install function (thanks @hermidalc).

Usage

library(xCell)

exprMatrix = read.table(expr_file,header=TRUE,row.names=1, as.is=TRUE)

xCellAnalysis(exprMatrix)

This function performs all three steps in xCell, which can be performed seperately as well:

rawEnrichmentAnalysis
transformScores
spillOver

exprMatrix = read.table(expr_file,header=TRUE,row.names=1, as.is=TRUE)

exprMatrix = read.table(file='C:\Users\?\Desktop\10-18-22HNSC \Xcell.txt', header=TRUE, row.names=1, as.is=TRUE)

https://github.com/dviraran/xCell#install

The best thing to do might actually be to perform GSEA both ways (log2FC and signed pvalue), then load both sets of results into EnrichmentMap (https://apps.cytoscape.org/apps/enrichmentmap), and identify sets selected as significant by both metrics. EnrichmentMap supports a mode where two datasets from GSEA can be loaded in simultaneously and co-displayed. That would probably be the most rigorous way to do it.

34 HALLMARK_OXIDATIVE_PHOSPHORYLATION 0.006535948 0.016263336 -0.494556993 -2.378736178 0 200 c("COX17", "ATP5PO", "MRPL35", "COX6C", "VDAC3", "NDUFA4", "TIMM8B", "MDH1", "ATP5MC1", "COX7A2", "NDUFA2", "SUCLA2", "ATP5PF", "NDUFS4", "PRDX3", "UQCR11", "MRPL15", "ECHS1", "NDUFA8", "MRPS11", "DLD", "NDUFB8", "SLC25A4", "NDUFB6", "NDUFA5", "NDUFA6", "NDUFC2", "ATP5F1C", "COX7B", "CYB5A", "POLR2F", "NDUFB7", "MRPL11", "NDUFB1", "ATP5ME", "NDUFAB1", "NDUFC1", "ATP5PD", "NDUFB3", "BDH2", "NDUFB4", "ACADM", "TIMM10", "UQCRB", "NDUFS3", "SDHD", "SURF1", "TOMM70", "ATP6V0B", "COX15", "MDH2", "CYCS",

9 HALLMARK_G2M_CHECKPOINT 0.006451613 0.016263336 -0.320157329 -1.535067914 0 196 c("MEIS2", "MEIS1", "MAD2L1", "CKS2", "CENPA", "RAD21", "CCNA2", "HMMR", "FBXO5", "BRCA2", "SRSF1", "EGF", "KPNA2", "AURKA", "PURA", "CDK1", "XPO1", "EZH2", "MARCKS", "NDC80", "TFDP1", "ESPL1", "ORC5", "CCNB2", "KIF23", "SNRPD1", "ORC6", "BIRC5", "CDC6", "PBK", "TOP2A", "SQLE", "MKI67", "DBF4", "TTK", "CUL4A", "CKS1B", "DTYMK", "CDC27", "CENPE", "HSPA8", "CDC45", "PTTG1", "CDC20", "KIF2C", "CBX1", "HOXC10", "BUB3", "RAD54L", "NEK2", "KNL1", "SRSF10", "BUB1", "SMC2", "CDKN3", "UBE2C", "PLK1", "RASAL2", GINS2, "AURKB", "SLC12A2", "HIRA", "INCENP", "CDC7")

An example of a rank metric is the product of the sign of the ‘direction’ in the expression change (i.e. ‘up-regulation’ is positive and ‘down-regulation’ is negative) and the p-value (P).

si=s(Pi)=sign(fold change gene i)⋅−log10(Pi)

Under this rank metric, up-regulated genes with relatively small p-values appear at the top of the list and down-regulated genes with small p-values at the bottom.

dds2 <- DESeqDataSetFromMatrix(countData=mycounts,

colData=metadata,

design=~Group,

tidy=TRUE)

fpm2 <- fpm(dds2)

pheatmap

dat <- read.csv("C:/Users/?/Desktop/xcell/B .csv")

dat[is.na(dat)] = 0

dat1 <- as.matrix(dat[, 3:14])

rownames(dat1) <- dat$ID

dat2 <- scale(dat1)

pheatmap(dat2,cluster_cols = F,cluster_rows = F,main = "")

annot_row <- data.frame(row.names = data$ID, Group = data$Group)

pheatmap(data2,annotation_row=annot_row,cluster_cols = F,cluster_rows = F,main = "B P + R vs NR")

A <- c("KDR", "JAG1", "COL3A1", "COL5A2", "PDGFB", "CD34", "LAMA2", "CHI3L1", "ITGAV", "HSPG2", "PECAM1", "NID1", "TIMP1", "ANGPT2", "CCL2", "IGF1", "TEK", "LEP", "MMP2", "EFNB2", "EDN1", "CXCL12", "FGF7", "FGFR1", "TGFB2", "TGFB3")

mat <- assay(vsdata)[ A, ]

tmat <- t(mat)

ztmat <- scale(tmat, center = TRUE, scale = TRUE)

tztmat <- t(ztmat)

library(pheatmap)

df <- as.data.frame(colData(vsdata)[,c("ID","Group")])

pheatmap(tztmat, annotation_col=df, cluster_rows=F, cluster_cols=F)

Join R Functions of dplyr Package Overview ggplot(

data = penguins,

mapping = aes(x = flipper_length_mm, y = body_mass_g, color = species)

) +

geom_point() +

geom_smooth(method = "lm")

ggplot(

data = penguins,

mapping = aes(x = flipper_length_mm, y = body_mass_g)

) +

geom_point(mapping = aes(color = species)) +

geom_smooth(method = "lm")

ggplot(

data = penguins,

mapping = aes(x = flipper_length_mm, y = body_mass_g)

) +

geom_point(mapping = aes(color = species, shape = species)) +

geom_smooth(method = "lm")

ggplot(

data = penguins,

mapping = aes(x = flipper_length_mm, y = body_mass_g)

) +

geom_point(aes(color = species, shape = species)) +

geom_smooth(method = "lm") +

labs(

title = "Body mass and flipper length",

subtitle = "Dimensions for Adelie, Chinstrap, and Gentoo Penguins",

x = "Flipper length (mm)", y = "Body mass (g)",

color = "Species", shape = "Species"

) +

scale_color_colorblind()

ggplot(penguins, aes(x = fct_infreq(species))) +

geom_bar()

A bar chart of frequencies of species of penguins, where the bars are ordered in decreasing order of their heights (frequencies): Adelie (approximately 150), Gentoo (approximately 125), Chinstrap (approximately 90).

ggplot(penguins, aes(x = body_mass_g)) +

geom_histogram(binwidth = 200)

ggplot(penguins, aes(x = body_mass_g, color = species)) +

geom_density(linewidth = 0.75)

ggplot(penguins, aes(x = body_mass_g, color = species, fill = species)) +

geom_density(alpha = 0.5)

A density plot of body masses of penguins by species of penguins. Each species (Adelie, Chinstrap, and Gentoo) is represented in different colored outlines for the density curves. The density curves are also filled with the same colors, with some transparency added. ggplot(penguins, aes(x = island, fill = species))+

geom_bar()

Bar plots of penguin species by island (Biscoe, Dream, and Torgersen) ggplot(penguins, aes(x = island, fill =species)) +

geom_bar(position = "fill")

Bar plots of penguin species by island (Biscoe, Dream, and Torgersen) the bars are scaled to the same height, making it a relative frequencies plot

ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +

geom_point(aes(color = species, shape = island))

ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +

geom_point(aes(color = species, shape = species)) +

facet_wrap(~island)

ggsave(filename = "penguin-plot.png")

https://r4ds.hadley.nz/layers.html

library("RColorBrewer")

display.brewer.all()

library("RColorBrewer")

> display.brewer.all()

> pheatmap(dat2,annotation_row=annot_row,cluster_cols = F,cluster_rows = F, color = colorRampPalette(rev(brewer.pal(n = 7, name ="RdYlBu")))(100))

> pheatmap(dat2,annotation_row=annot_row,cluster_cols = F,cluster_rows = F, color = colorRampPalette(rev(brewer.pal(n = 7, name ="Spectral")))(100))

> pheatmap(dat2,annotation_row=annot_row,cluster_cols = F,cluster_rows = F, color = colorRampPalette(rev(brewer.pal(n = 7, name ="RdYlGn")))(100))

> pheatmap(dat2,annotation_row=annot_row,cluster_cols = F,cluster_rows = F, color = colorRampPalette(rev(brewer.pal(n = 7, name ="Set1")))(100))

pheatmap(dat2,annotation_row=annot_row,cluster_cols = F,cluster_rows = F, color = colorRampPalette(c("blue", "pink", "red"))(50))

Correlations were plotted using the corrplot package

pheatmap(test, color = colorRampPalette(c("navy", "white", "firebrick3"))(50))

if (!require(devtools)) install.packages("devtools")

devtools::install_github("yanlinlin82/ggvenn")

a <- list('A - R PvsPre, down' = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27),

'A - R PvsPre, up' = c(28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54),

'A - NR PvsPre, up' = c(28, 1, 29, 4, 3, 34, 6, 36, 37, 38, 11, 42, 43, 44, 15, 45, 16, 17, 18, 22, 48, 49, 51, 52, 26),

'A - NR PvsPre, down' = c(30, 31, 2, 3, 32, 5, 35, 7, 8, 9, 10, 12, 39, 40, 13, 41, 14, 46, 47, 19, 20, 21, 23, 24, 50, 25, 53, 54, 27))

ggvenn(a)

dmm <- dd1 %>% group_by(Gene.signature) %>% summarise_if(is.numeric, mean, na.rm = TRUE)

HNSC2 <- HNSC1[ , grepl( "01A" , names(HNSC1) ) ]

# this gives log2(n + 1)

ntd <- normTransform(dds)

library("vsn")

meanSdPlot(assay(ntd))

Remove columns from dataframe where ALL values are NA

x y z

1 1 1 NA

2 2 2 NA

3 3 NA NA

4 4 4 NA

5 5 5 NA

!apply (is.na(df), 2, all)

var1 var2 var3

TRUE TRUE FALSE

> df[, !apply(is.na(df), 2, all)]

var1 var2

1 1 1

2 2 2

3 3 1

4 4 3

5 5 4

6 6 NA

7 7 NA

8 NA 9

3. Row which contains all column values that are missing

Suppose if you want to remove all column values contains NA then following codes will be handy.

Method 1:Using is.na(), rowSums() & ncol() Functions

df=data.frame(Col1=c(NA,"B","C","D",

"P1","P2","P3")

,Col2=c(NA,8,NA,9,10,8,9)

,Col3=c(NA,7,6,8,NA,7,8)

,Col4=c(NA,NA,7,7,NA,7,7))

df[rowSums(is.na(df)) != ncol(df), ]

Col1 Col2 Col3 Col4

2 B 8 7 NA

3 C NA 6 7

4 D 9 8 7

5 P1 10 NA NA

6 P2 8 7 7

7 P3 9 8 7

Method 2:Using Using filter() Function

df=data.frame(Col1=c(NA,"B","C","D",

"P1","P2","P3")

,Col2=c(NA,8,NA,9,10,8,9)

,Col3=c(NA,7,6,8,NA,7,8)

,Col4=c(NA,NA,7,7,NA,7,7))

library("dplyr")

filter(df, rowSums(is.na(df)) != ncol(df))

Col1 Col2 Col3 Col4

1 B 8 7 NA

2 C NA 6 7

3 D 9 8 7

4 P1 10 NA NA

5 P2 8 7 7

6 P3 9 8 7

Replace NA with 0

SurDRRA[is.na(SurDRRA)] <- 0

b <- list(`MacKO:HetC_DIET,down` = 9:33, `MacKO:HetC_DIET,up` = 1:8, `KO:WT_DIET,up` = c(1:9, 13, 14, 18, 28:30, 32, 34:42), `KO:WT_DIET,down` = c(21, 23))

ggvenn(b, set_name_size = 3, text_size = 4, set_name_color = c("blue", "yellow4", "green4", "red")) + coord_cartesian(xlim = c(-3, 3), ylim = c(-3, 3))

ggvenn(a, set_name_size = 3, set_name_color = c("blue", "yellow4", "green4", "red"), text_size = 4, stroke_size = 0.5) + coord_cartesian(xlim = c(-3, 3), ylim = c(-3, 3)) + labs(title = "Stroma hallmark pathway")

ggvenn(a, fill_color = c("#F8766D", "#00BFC4"), text_size = 6, set_name_color = c("#F8766D", "#00BFC4"), set_name_size = 8)

heatmap

d <- data.table::fread("C:/Users/?/Desktop/D +vs-R TCIA.txt.gz", data.table = F)

data <- d %>% column_to_rownames("celltype")

ddd <- as.matrix(data)

heatmap(ddd, cexRow = 1, cexCol = 1, labRow = d$celltype, margins = c(8, 10))

pheatmap(ddd)

data <- d %>% column_to_rownames("celltype")

ddd <- as.matrix(data)

pheatmap(ddd,cluster_cols = F,cluster_rows = F,main = "ADM safeTME",

color=colorRampPalette(c("navy", "white", "red"))(50),

cutree_cols=2)

pheatmap(ddd,cluster_cols = F,cluster_rows = F,main = "ADM safeTME",

color=colorRampPalette(c("#00BFC4", "white", "#F8766D"))(50),

cutree_cols=2)

Spatial Decon

temp = replace(restils$prop_of_nontumor, is.na(restils$prop_of_nontumor), 0)

o = hclust(dist(temp))$order

layout(mat = (matrix(c(1, 2), 1)), widths = c(7, 3))

TIL_barplot(ddd, draw_legend = TRUE, cex.names = 0.5)

d <- data.table::fread("C:/Users/?/Desktop/Til/DTil/DTilDM1difference.txt.gz", data.table = F)

data <- d %>% column_to_rownames("celltype")

ddd <- as.matrix(data)

temp = replace(restils$prop_of_nontumor, is.na(restils$prop_of_nontumor), 0)

o = hclust(dist(temp))$order

layout(mat = (matrix(c(1, 2), 1)), widths = c(7, 3))

TIL_barplot(ddd, draw_legend = TRUE, cex.names = 0.5, main = "D DM P vs Pre TME cell type difference")

To access any other kind of file with compression, simply use gzfile("") around the file name:

write.table(tst.df,gzfile("test.dat.gz")) # write a compressed file read.table(gzfile("test.dat.gz"),row.names=1)# read it back in

results <- cibersort(HNSC, q_Norm, perm = 100, QN=TRUE)

pheatmap

Error in cut.default(x, breaks = breaks, include.lowest = T) :

'x' must be numeric

D5 <- apply(as.matrix(d4), 2, as.numeric)

Labels <- c(rep("R", 6),rep("P", 5),rep("N", 5),rep("N_P", 5))

names(Labels) <- rownames(d5)

Pos_DM <- as.data.frame(Labels)

p1 <- one.step.pigengene(Data=d1,saveDir='pigengene', bnNum=0, verbose=1,

seed=1, Labels=Labels, toCompact=FALSE, doHeat=FALSE)

h1 <- pheatmap.type(d5[,1:20],annRow= Pos_DM,show_rownames=FALSE)

ggplot(b, aes(fill=Cell.state, z.scores, x=celltype), size = 12) +

geom_bar(position="stack", stat="identity") + + theme(axis.text.x = element_text(angle = 90, hjust = 1))

Cell State _Negative _Neg Cell type Abundance _Positive _Pos Cell type Abundance

S01 _Neg_B cells_NR_P 0.00150677 _ Pos_B cells_NR_P 0.171670673

S02 _Neg_B cells_NR_P 0.088171129 _ Pos_B cells_NR_P 0.088419299

S03 _Neg_B cells_NR_P 0.057669946 _ Pos_B cells_NR_P 0.107745723

S04 _Neg_B cells_NR_P 0.032971537 _ Pos_B cells_NR_P 0.072909367

S05 _Neg_B cells_NR_P 0.047689777 _ Pos_B cells_NR_P 0.133199207

S01 _Neg_B cells_NR_Pre 0.019111569 _ Pos_B cells_NR_Pre 0.22481643

S02 _Neg_B cells_NR_Pre 0.061897723 _ Pos_B cells_NR_Pre 0.0780146

S03 _Neg_B cells_NR_Pre 0.091447498 _ Pos_B cells_NR_Pre 0.05800381

S04 _Neg_B cells_NR_Pre 0.07043731 _ Pos_B cells_NR_Pre 0.091037166

S05 _Neg_B cells_NR_Pre 0.061042366 _ Pos_B cells_NR_Pre 0.131287177

S01 _Neg_B cells_R_P 4.90E-05 _ Pos_B cells_R_P 0.072245259

S02 _Neg_B cells_R_P 0.113785657 _ Pos_B cells_R_P 0.084129808

S03 _Neg_B cells_R_P 0.102182471 _ Pos_B cells_R_P 0.144895115

S04 _Neg_B cells_R_P 0.044155086 _ Pos_B cells_R_P 0.114542694

S05 _Neg_B cells_R_P 0.119914577 _ Pos_B cells_R_P 0.058223155

library(circlize)

x = rnorm(2600)

factors = sample(letters, 2600, replace = TRUE)

circos.initialize(factors = factors, x = x)

circos.trackHist(factors = factors, x = x, track.height = 0.1, col = "#999999", border = "#999999")

circos.trackHist(factors = factors, x = x, force.ylim = FALSE, bin.size = 0.1, track.height = 0.1, col = "#999999", border = "#999999")

circos.trackHist(factors = factors, x = x, draw.density = TRUE, track.height = 0.1, col = "#999999", border = "#999999")

circos.trackHist(factors = factors, x = x, draw.density = TRUE, force.ylim = FALSE, track.height = 0.1, col = "#999999", border = "#999999")

par(mfrow = c(1, 2))

circos.initialize(letters[1:4], xlim = c(0, 10))

circos.track(ylim = c(0, 1), panel.fun = function(x, y) {

value = runif(10)

circos.barplot(value, 1:10 - 0.5, col = 1:10)

})

circos.track(ylim = c(-1, 1), panel.fun = function(x, y) {

value = runif(10, min = -1, max = 1)

circos.barplot(value, 1:10 - 0.5, col = ifelse(value > 0, 2, 3))

})

circos.clear()

circos.initialize(letters[1:4], xlim = c(0, 10))

circos.track(ylim = c(0, 4), panel.fun = function(x, y) {

value = matrix(runif(10*4), ncol = 4)

circos.barplot(value, 1:10 - 0.5, col = 2:5)

})

https://jokergoo.github.io/circlize_book/book/circos-heatmap.html#circos-heatmap-input-data

https://bioinformatics.ccr.cancer.gov/docs/data-visualization-with-r/Lesson5_intro_to_ggplot/

#data import from excel

data<-readxl::read_xlsx("./data/RNASeq_totalcounts_vs_totaltrans.xlsx",

1,.name_repair = "universal")

Readxl’s default is .name_repair = "unique"

scatter_plot<-ggplot(data=data) +

geom_point(aes(x=Number.of.Transcripts, y = Total.Counts,

color=Treatment))

scatter_plot +

scale_color_manual(values=c("red","black"),

labels=c('treated','untreated'))

scatter_plot +

scale_color_grey()

To apply scale_fill to shape, we will have to alter the shapes, as only some shapes take a fill argument.

Image from https://ggplot2.tidyverse.org/articles/ggplot2-specs.html:

R Shapes

ggplot(data=data) +

geom_point(aes(x=Number.of.Transcripts, y = Total.Counts,fill=Treatment),

shape=21,size=2) + #increase size and change points

scale_fill_manual(values=c("purple", "yellow"))

## Class Sex Age Survived Freq

## 1 1st Male Child No 0

## 2 2nd Male Child No 0

## 3 3rd Male Child No 35

## 4 Crew Male Child No 0

## 5 1st Female Child No 0

## 6 2nd Female Child No 0

ggplot() + geom_col(data = Titanic, aes(x = Class, y = Freq, fill = Survived), position = "dodge") +

facet_grid(Sex ~ Age)

ggplot(data=data) +

geom_point(aes(x=Number.of.Transcripts, y = Total.Counts,

fill=Treatment),

shape=21,size=2) +

scale_fill_manual(values=c("purple", "yellow"),

labels=c('treated','untreated'))+

#can change labels of fill levels along with colors

xlab("Recovered transcripts per sample") + #add x label

ylab("Total sequences per sample") +#add y label

guides(fill = guide_legend(title="Treatment")) + #label the legend

scale_y_continuous(trans="log10") + #log transform the y axis

theme_bw()

ggsave("Plot1.png",width=5.5,height=3.5,units="in",dpi=300)

http://timer.comp-genomics.org/timer/

performed GSEA with MSigDB sets and used gseaplot2 to visualize the results:

display_geneset <- "IVANOVA_HEMATOPOIESIS_STEM_CELL_LONG_TERM"

p_msig <- gseaplot2(gse_cgp, geneSetID = display_geneset, title = gse_cgp$Description[gse_cgp$ID == display_geneset],

color = "green", base_size = 5) + theme( aspect.ratio = 1/1.618)

p_msig

ggsave("output/clusterProfiler_GSEA_IVANOVA_small.svg", width=6, units = "cm", scale = 1)

Example 1: S4 Class and Object in R

# create a class "Student_Info" with three member variables

setClass("Employee_Info", slots=list(name="character", age="numeric", role="character"))

# create an object of class

employee1 <- new("Employee_Info", name = "Peter", age = 21, role = "Developer")

# call employee1 object

employee1

https://rpubs.com/pranali018/enrichmentAnalysis

library(enrichplot)

library(ggplot2)

library("org.Hs.eg.db")

library(clusterProfiler)

library(pathview)

require(DOSE)

dotplot(gse, showCategory=3, split=".sign") + facet_grid(.~.sign)

#remove any rows where GeneSymbol is not present

num = which(is.na(res$GeneSymbol))

res = res[-num, ]

# we want the log2 fold change

original_gene_list = res$log2FoldChange

# name the vector

names(original_gene_list) <- res$GeneSymbol

# omit any NA values

gene_list<-na.omit(original_gene_list)

# sort the list in decreasing order (required for clusterProfiler)

gene_list = sort(gene_list, decreasing = TRUE)

gse <- gseGO(geneList=gene_list,

ont ="ALL",

keyType = "SYMBOL",

minGSSize = 3,

maxGSSize = 800,

pvalueCutoff = 0.05,

verbose = TRUE,

OrgDb = org.Hs.eg.db,

pAdjustMethod = "none")

out = as.matrix(gse@result)

dotplot(gse, showCategory=3, split=".sign") + facet_grid(.~.sign)

The dotplot shows the ontology terms which are either activated (up regulated) or suppresed(down regulated) in the dataset. The number of ontology terms per feature (BP, MF, CC) to be displayed is determined by the showCateory function. The color of the dots indicate the pvalues for each of the terms and the size of the dots depicts the number of genes overlapping in each feature

x2 = pairwise_termsim(gse)

emapplot(x2, showCategory = 20)

Enrichment map organizes enriched terms into a network with edges connecting overlapping gene sets. In this way, mutually overlapping gene sets are tend to cluster together, making it easy to identify functional modules.

# Use the `Gene Set` param for the index in the title, and as the value for geneSetId

gseaplot(gse, by = "all", title = gse$Description[1], geneSetID = 1)

Plot of the Running Enrichment Score (green line) for a gene set as the analysis walks down the ranked gene list, including the location of the maximum enrichment score (the red line). The black lines in the Running Enrichment Score show where the members of the gene set appear in the ranked list of genes, indicating the leading edge subset.

The Ranked list metric shows the value of the ranking metric (log2 fold change) as you move down the list of ranked genes. The ranking metric measures a gene’s correlation with a phenotype.

gseaplot(kk2, by = "all", title = kk2$Description[19], geneSetID = 19)

r1 = pathview(gene.data = kegg_gene_list, pathway.id = "hsa00190", species = kegg_organism)

gene_list = sort(ranks, decreasing = TRUE)

hallmark <- GSEA(

geneList = gene_list,

exponent = 1,

minGSSize = 10,

maxGSSize = 500,

eps = 1e-10,

pvalueCutoff = 0.05,

pAdjustMethod = "BH",

gson = NULL,

TERM2GENE = hall,

TERM2NAME = NA,

verbose = TRUE,

seed = FALSE,

by = "fgsea")

gseaplot(kk2, by = "all", title = kk2$Description[1], geneSetID = 1)

DESeqAnalysis

if (!requireNamespace("BiocManager", quietly = TRUE)) {

install.packages("BiocManager")

}

install.packages(

pkgs = "DESeqAnalysis",

repos = c(

"https://r.acidgenomics.com",

BiocManager::repositories()

dependencies = TRUE

)

https://rdrr.io/github/acidgenomics/DESeqAnalysis/man/DESeqAnalysis.html

resultsNames(data)

name <- resultsNames(data)[[2L]]

results <- DESeq2::results(data, name = name)

class(results)

lfcShrink <- DESeq2::lfcShrink(dds = data, res = results, coef = 2L)

results <- list(results)

names(results) <- name

lfcShrink <- list(lfcShrink)

names(lfcShrink) <- name

object <- DESeqAnalysis(

data = data,

transform = transform,

results = results,

lfcShrink = lfcShrink

)

plotDegStackedBar(

object,

i = 1L,

direction = c("both", "up", "down"),

orderBySize = FALSE,

label = TRUE,

flip = TRUE

)

assay(object@transform)

assay(object@data)

results(object, i = 1L)

sampleData(object)

genes <- head(rownames(object), 50L)

head(genes)

samples <- head(colnames(object), 2L)

head(samples)

x <- object[genes, samples]

mai <- assay(vsdata)[HY, ]

mai <- as.data.frame(mai)

ma <- res %>% filter((GeneSymbol %in% HY) & pvalue < 0.05)

mai1 <- mai %>% mutate(rownames(mai))

names(mai1)[11] <- "GeneSymbol"

mai2 <- merge(mai1, ma)

mai3 <- mai2 %>% arrange(desc(TD23_2))

mai4 <- apply(as.matrix(mai3[, 1:11]), 2, as.numeric)

rownames(mai4) <- mai3$GeneSymbol

df <- as.data.frame(colData(vsdata)[, c("ID", "Group")])

pheatmap(mai4[,-1], annotation_col=df, cluster_rows=F, cluster_cols=F, main = "HALLMARK_HYPOXIA")

mai <- assay(vsdata)[IN, ]

mai <- as.data.frame(mai)

ma <- res %>% filter((GeneSymbol %in% IN) & pvalue < 0.05)

mai1 <- mai %>% mutate(rownames(mai))

names(mai1)[10] <- "GeneSymbol"

mai2 <- merge(mai1, ma)

mai3 <- mai2 %>% arrange(desc(TD21_2))

mai4 <- apply(as.matrix(mai3[, 1:10]), 2, as.numeric)

rownames(mai4) <- mai3$GeneSymbol

df <- as.data.frame(colData(vsdata)[, c("ID", "Group")])

pheatmap(mai4[,-1], annotation_col=df, cluster_rows=F, cluster_cols=F, main = " -NR HALLMARK_TNFA_SIGNALING_VIA_NFKB ")

mai <- assay(vsdata)[IN, ]

mai <- as.data.frame(mai)

ma <- res %>% filter((GeneSymbol %in% IN) & pvalue < 0.05)

mai1 <- mai %>% mutate(rownames(mai))

names(mai1)[24] <- "GeneSymbol"

mai2 <- merge(mai1, ma)

mai3 <- mai2 %>% arrange(desc(CK .R33.RM29P))

mai4 <- apply(as.matrix(mai3[, 1:24]), 2, as.numeric)

rownames(mai4) <- mai3$GeneSymbol

df <- as.data.frame(colData(vsdata)[, c("ID", " Treatment")])

pheatmap(mai4[,-1], annotation_col=df, cluster_rows=F, cluster_cols=F, main = " NFKB ")

gseaplot2(

gse,

geneSetID = 876,

title = gse$Description[876],

color = "green",

base_size = 11,

rel_heights = c(1.5, 0.5, 1),

subplots = 1:3,

pvalue_table = FALSE,

ES_geom = "line"

)

Point label

https://bookdown.org/ndphillips/YaRrr/low-level-plotting-functions.html

Connecting points with segments().

before <- c(2.1, 3.5, 1.8, 4.2, 2.4, 3.9, 2.1, 4.4)

after <- c(7.5, 5.1, 6.9, 3.6, 7.5, 5.2, 6.1, 7.3)

# Create plotting space and before scores

plot(x = rep(1, length(before)),

y = before,

xlim = c(.5, 2.5),

ylim = c(0, 11),

ylab = "Score",

xlab = "Time",

main = "Using segments() to connect points",

xaxt = "n")

# Add after scores

points(x = rep(2, length(after)), y = after)

# Add connections with segments()

segments(x0 = rep(1, length(before)),

y0 = before,

x1 = rep(2, length(after)),

y1 = after,

col = gray(0, .5))

# Add labels

mtext(text = c("Before", "After"),

side = 1, at = c(1, 2), line = 1)

Connecting points with segments().

Figure 11.10: Connecting points with segments().

plot(x = pirates$height,

y = pirates$weight,

pch = 16,

col = transparent("skyblue", .7),

main = "Adding a regression line to a scatterplot()")

# Add the regression line

abline(lm(weight ~ height, data = pirates),

lty = 2)

plot(x = height,

y = weight,

xlim = c(155, 180),

ylim = c(65, 80),

pch = 16,

col = yarrr::piratepal("xmen"))

# Add id labels

text(x = height,

y = weight,

labels = id,

pos = 3) # Put labels above the points

Adding labels to points with text()

# Create the plot

plot(x = ChickWeight$Time,

y = ChickWeight$weight,

col = gray(.3, .5),

pch = 16,

main = "Combining text with numeric scalers using paste()")

# Add reference line

abline(h = mean(ChickWeight$weight),

lty = 2)

# Add text

text(x = 3,

y = mean(ChickWeight$weight),

labels = paste("Mean weight =",

round(mean(ChickWeight$weight), 2)),

pos = 3)

layout(mat = matrix(c(2, 1, 0, 3),

nrow = 2,

ncol = 2),

heights = c(1, 2), # Heights of the two rows

widths = c(2, 1)) # Widths of the two columns

# Plot 1: Scatterplot

par(mar = c(5, 4, 0, 0))

plot(x = pirates$height,

y = pirates$weight,

xlab = "height",

ylab = "weight",

pch = 16,

col = yarrr::piratepal("pony", trans = .7))

# Plot 2: Top (height) boxplot

par(mar = c(0, 4, 0, 0))

boxplot(pirates$height, xaxt = "n",

yaxt = "n", bty = "n", yaxt = "n",

col = "white", frame = FALSE, horizontal = TRUE)

# Plot 3: Right (weight) boxplot

par(mar = c(5, 0, 0, 0))

boxplot(pirates$weight, xaxt = "n",

yaxt = "n", bty = "n", yaxt = "n",

col = "white", frame = F)

Adding boxplots to margins of a scatterplot with layout().

cor.test(formula = ~ Zyx + Zzz3, data = b2)

Pearson's product-moment correlation

data: Zyx and Zzz3

t = 1.265, df = 15, p-value = 0.2252

alternative hypothesis: true correlation is not equal to 0

95 percent confidence interval:

-0.2000037 0.6884001

sample estimates:

cor

0.3104886

d$ID[duplicated(d$ID)]

[1] "SLC35E2"

> which(duplicated(d$ID))

[1] 16273

> which(d$ID == "SLC35E2")

[1] 16272 16273

subset(x = ToothGrowth,

subset = len > 30 & supp == "VC",

select = c(len, dose))

Table 9.1: Functions for managing your workspace, working directory, and writing data from R as .txt or .RData files, and reading files into R

Code Description

ls() Display all objects in the current workspace

rm(a, b, ..) Removes the objects a, b… from your workspace

rm(list = ls()) Removes all objects in your workspace

getwd() Returns the current working directory

setwd(file = "dir) Changes the working directory to a specified file location

list.files() Returns the names of all files in the working directory

write.table(x, file = "mydata.txt", sep) writes the object x to a text file called mydata.txt. Define how the columns will be separated with sep (e.g.; sep = "," for a comma–separated file, and sep = t" for a tab–separated file).

save(a, b, .., file = "myimage.RData) Saves objects a, b, … to myimage.RData

save.image(file = "myimage.RData") Saves all objects in your workspace to myimage.RData

load(file = "myimage.RData") Loads objects in the file myimage.RData

read.table(file = "mydata.txt", sep, header) Reads a text file called mydata.txt, define how columns are separated with sep (e.g. sep = "," for comma-delimited files, and sep = "t" for tab-delimited files), and whether there is a header column with header = TRUE

save(a, b, c, file = "myobjects.RData")

save.image(file = "data/projectimage.RData")

study1.df <- data.frame(id = 1:5,

sex = c("m", "m", "f", "f", "m"),

score = c(51, 20, 67, 52, 42))

score.by.sex <- aggregate(score ~ sex,

FUN = mean,

data = study1.df)

study1.htest <- t.test(score ~ sex,

data = study1.df)

mydata <- read.table(file = 'data/mydata.txt', # file is in a data folder in my working directory

sep = 't', # file is tab--delimited

header = TRUE, # the first row of the data is a header row

stringsAsFactors = FALSE) # do NOT convert strings to factors!!

# Read a text file from the web

fromweb <- read.table(file = 'http://goo.gl/jTNf6P',

sep = 't',

header = TRUE)

pirates <- pirates[order(pirates$sex, pirates$height),]

combined.survey <- merge(x = risk.survey,

y = happiness.survey,

by = "participant",

all = TRUE)

The symbol types associated with the pch plotting parameter.

hist(x = ChickWeight$weight[ChickWeight$Diet == 1],

main = "Two Histograms in one",

xlab = "Weight",

ylab = "Frequency",

breaks = 20,

xlim = c(0, 500),

col = gray(0, .5))

hist(x = ChickWeight$weight[ChickWeight$Diet == 2],

breaks = 30,

add = TRUE, # Add plot to previous one!

col = gray(1, .8))

barplot(height = 1:5, # A vector of heights

names.arg = c("G1", "G2", "G3", "G4", "G5"), # A vector of names

main = "Example Barplot",

xlab = "Group",

ylab = "Height")

swim.data

## Naked Clothed

## No Shark 2.1 1.5

## Shark 3.0 3.0

Now, when I enter this matrix as the height = swim.data argument to barplot(), I’ll get multiple bars.

barplot(height = swim.data,

beside = TRUE, # Put the bars next to each other

legend.text = TRUE, # Add a legend

col = c(transparent("green", .2),

transparent("red", .2)),

main = "Swimming Speed Experiment",

ylab = "Speed (in meters / second)",

xlab = "Clothing Condition",

ylim = c(0, 4))

https://cran.r-project.org/web/packages/pals/vignettes/bivariate_choropleths.html

https://rfunctions.blogspot.com/2015/03/bivariate-maps-bivariatemap-function.html

WGCNA

https://fuzzyatelin.github.io/bioanth-stats/module-F21-Group1/module-F21-Group1.html

d <- dat1 %>% separate_wider_delim(`X|Y`, delim = "|", names = c("X", "Y"))

https://rdrr.io/cran/DGEobj.utils/man/convertCounts.html

https://alexslemonade.github.io/refinebio-examples/03-rnaseq/pathway-analysis_rnaseq_03_gsva.html

library(vtable)

dd <- st(dat, group = 'Condition', group.test = TRUE)

dat1 <- dat[, 2:52] %>%

gather(key = variable, value = value, - Condition) %>%

group_by(Condition, variable) %>%

summarise(value = list(value)) %>%

spread(Condition, value) %>%

group_by(variable) %>%

mutate(p_value = t.test(unlist(TT), unlist(NT))$p.value,

t_value = t.test(unlist(TT), unlist(NT))$statistic)

library(annotables)

grch38 %>% filter(ensgene %in% )

ANGIOGENESIS_lm <- lm(HALLMARK_ANGIOGENESIS ~ MFP + _status, data = HALL_clinic_TME)

ANGIOGENESIS.II.aov <- car::Anova(ANGIOGENESIS_lm, type = 2)

Correlation

https://statsandr.com/blog/correlation-coefficient-and-correlation-test-in-r/

library("ggpubr")

ggscatter(CE9F, x = "CE10", y = "OS",

add = "reg.line", conf.int = TRUE,

cor.coef = TRUE, cor.method = "pearson",

xlab = "CE10_F", ylab = "OS")

cor.test(formula = ~ CE9 + OS,

data = Pos_CE1,

subset = MFP == "D")

t.test(formula = CE10 ~ MFP,

data = CE9IE_F1)

CE1 CE2 CE3 CE4

TCGA-BA-5152 0.15368566 0.276724867 0.19438770 0.044953297

TCGA-BA-5153 0.09342664 0.046838160 0.02334477 0.006748026

TCGA-BA-5559 0.07732019 0.006907233 0.07898090 0.252107417

round(cor(dat), 2)

CE1 CE2 CE3 CE4 CE5 CE6 CE7 CE8 CE9 CE10 OS

CE1 1.00 0.12 0.20 0.07 -0.35 0.15 -0.26 -0.60 -0.26 -0.30 -0.08

CE2 0.12 1.00 0.03 -0.10 -0.34 -0.38 0.13 -0.14 -0.34 -0.52 -0.08

CE3 0.20 0.03 1.00 0.02 -0.35 -0.03 -0.35 -0.57 0.28 -0.01 -0.20

CE4 0.07 -0.10 0.02 1.00 -0.06 0.32 -0.42 -0.21 -0.27 -0.25 -0.07

CE5 -0.35 -0.34 -0.35 -0.06 1.00 0.10 -0.05 0.57 -0.19 -0.09 -0.14

CE6 0.15 -0.38 -0.03 0.32 0.10 1.00 -0.23 -0.16 -0.31 0.00 0.00

CE7 -0.26 0.13 -0.35 -0.42 -0.05 -0.23 1.00 0.48 -0.15 -0.17 0.10

CE8 -0.60 -0.14 -0.57 -0.21 0.57 -0.16 0.48 1.00 -0.19 -0.10 0.12

CE9 -0.26 -0.34 0.28 -0.27 -0.19 -0.31 -0.15 -0.19 1.00 0.54 0.07

CE10 -0.30 -0.52 -0.01 -0.25 -0.09 0.00 -0.17 -0.10 0.54 1.00 0.21

OS -0.08 -0.08 -0.20 -0.07 -0.14 0.00 0.10 0.12 0.07 0.21 1.00

ggplot(dat) +

aes(x = CE1, y = CE2) +

geom_point(colour = "#0c4c8a") +

theme_minimal()

pairs(dat[, c("CE9", "CE10", "OS")])

library(corrplot)

corrplot(cor(dat),

method = "number",

type = "upper" # show only upper side

)

correlation tests for whole dataset

library(Hmisc)

res <- rcorr(as.matrix(dat)) # rcorr() accepts matrices only

# display p-values (rounded to 3 decimals)

round(res$P, 3)

library(ggstatsplot)

ggscatterstats(

data = dat,

x = CE1,

y = CE2,

bf.message = FALSE,

marginal = FALSE # remove histograms

)

library(correlation)

correlation::correlation(dat,

include_factors = TRUE, method = "auto"

)

[ 打印 ]

[ 加入书签 ]

阅读 () ┆ 评论

目前还没有任何评论

登录后才可评论.