24种R语言新手入门之小提琴图(三)(r语言入门基础教程)
 南窗  分类:IT技术  人气:129  回帖:0  发布于1年前 收藏

一、前言

柱状图和箱线图的代码能理解了其实发现好多作图都是可以触类旁通的,小提琴图作为科研结果常用展示图也不可或缺,用ggplot或者vioplot。

二、基本图形

2.1 基本小提琴图

library("vioplot")
#准备数据
ToothGrowth$dose <- as.factor(ToothGrowth$dose)
p <- ggplot(ToothGrowth, aes(x=dose, y=len)) + 
  geom_violin(trim=T) #T改为F即完整的小提琴图
p
#选择需要显示的x项
p + scale_x_discrete(limits=c("0.5", "2"))

2.2 添加数值

#添加中位值median,均值改mean
p + stat_summary(fun.y=mean, geom="point", shape=23, size=2, color="red")
#中位数和四分位数
p + geom_boxplot(width=0.1)
#均值和标准差
p <- ggplot(ToothGrowth, aes(x=dose, y=len)) + 
  geom_violin(trim=FALSE)
p + stat_summary(fun.data=mean_sdl, mult=2, 
                 geom="pointrange", color="red")

2.3 添加散点

#带点小提琴图
p + geom_dotplot(binaxis='y', stackdir='center', dotsize=1)
#点发散
p + geom_jitter(shape=16, position=position_jitter(0.2))

2.4 上色

定义颜色的代码跟前两章柱状图箱线图的一样,都对应着线框颜色和填充颜色。

2.5 柱状小提琴复合图

#准备数据
set.seed(1)
n <- 10000
ii <- rbinom(n, 1, 0.5)
data <- rnorm(n, mean = 130, sd = 10) * ii +
  rnorm(n, mean = 80, sd = 5) * (1 - ii)
#柱状图
hist(data, probability = TRUE, 
     col = "grey", axes = FALSE,
     main = "", xlab = "",  ylab = "")
axis(1)
#密度线
lines(density(data), lwd = 2, col = "red")
#添加小提琴图
par(new = TRUE)
vioplot(data, horizontal = TRUE, 
        yaxt = "n", axes = FALSE,
        col = ggsci::pal_npg("nrc",alpha = 0.2)(1)) 

2.6 拆分小提琴图

#准备数据,分组
data <- trees
tall <- trees[trees$Height >= 76, ]
small <- trees[trees$Height < 76, ]
#绘图
vioplot(tall, side = "left", plotCentre = "line", col = "#3B4992B2")
vioplot(small, side = "right", plotCentre = "line", col = "#EE0000B2", add = TRUE)
#图例
legend("topleft", legend = c("Tall", "Small"), fill = c("#3B4992B2","#EE0000B2"), cex = 1.25)

2.7 排序

#分画布
par(mfrow = c(1, 2))
data <- chickwts

#低到高
medians <- reorder(data$feed, data$weight, median)
#medians <- with(data, reorder(feed, weight, median)) # 也可用这行,效果一样的,下面同理
vioplot(data$weight ~ medians, col = 2:(length(levels(data$feed)) + 1),
        xlab = "", ylab = "Weight", las = 2)
#高到低
medians <- reorder(data$feed, -data$weight, median)
vioplot(data$weight ~ medians, col = 2:(length(levels(data$feed)) + 1),
        xlab = "", ylab = "Weight", las = 2)

三、进阶画图

3.1 多基因组间小提琴图

library("ggpubr")
library("reshape2")
norcol="#EE0000FF"                    #疾病组
concol="#3B4992FF"                   #正常组

#读取文件
rt=read.table(inputFile,header=T,sep="\t",check.names=F,row.names=1)
x=colnames(rt)[1]
colnames(rt)[1]="Type"

#转换分类矩阵数据
data=melt(rt,id.vars=c("Type"))
colnames(data)=c("Type","Gene","Expression")

#绘图
pdf(file=outFile, width=6, height=5)
p=ggviolin(data, x="Gene", y="Expression", color = "Type", 
           ylab="Gene expression",
           xlab="Gene",
           #legend.title=x,
           add.params = list(fill="white"),
           palette = c(concol,norcol),
           width=1, add = "boxplot")
#p=p+rotate_x_text(60)
p1=p+stat_compare_means(aes(group=Type),
                        method="wilcox.test", #可换其他统计方法
                        symnum.args=list(cutpoints = c(0, 0.001, 0.01, 0.05, 1), symbols = c("***", "**", "*", " ")),
                        label = "p.signif")
print(p1)
dev.off()

3.2 单基因多组小提琴图

library(ggpubr) 
library(ggsci) 
ggsci::pal_tron("legacy")(4)
color=c("#FF410DFF","#6EE2FFFF","#F7C530FF","#95CC5EFF")

#读取文件
rt=read.table(inputFile,header=T,sep="\t",check.names=F)
x=colnames(rt)[2]
y=colnames(rt)[3]
colnames(rt)=c("id","Type","Expression")

#设置分组
group=levels(factor(rt$Type))
rt$Type=factor(rt$Type, levels=group)
comp=combn(group,2)
my_comparisons=list()
for(i in 1:ncol(comp)){my_comparisons[[i]]<-comp[,i]}

#绘图
pdf(file=outFile, width=6, height=5)
ggviolin(rt, x="Type", y="Expression", fill = "Type", 
         xlab=x, ylab=paste0(y,"  expression"),
         legend.title=x,
         palette = color,
         add = "boxplot", add.params = list(fill="white"))+ 
#stat_compare_means(comparisons = my_comparisons) #显示具体数值
stat_compare_means(comparisons = my_comparisons,
                   symnum.args=list(cutpoints = c(0, 0.001, 0.01, 0.05, 1), 
                                    symbols = c("***", "**", "*", "ns")),
                   label = "p.signif") #显著替换成星号
dev.off()

3.3 多基因多组小提琴图

library(ggplot2)
library(reshape2)
color = c("#FF410DFF","#6EE2FFFF","#F7C530FF","#95CC5EFF")

#读取文件
rt=read.table(inputFile, header=T,sep="\t",check.names=F,row.names=1)
x=colnames(rt)[1]
colnames(rt)[1]="Type"

#计算组间差异
geneSig=c("")
for(gene in colnames(rt)[2:ncol(rt)]){
  rt1=rt[,c(gene,"Type")]
  colnames(rt1)=c("expression","Type")
  p=1
  if(length(levels(factor(rt1$Type)))>2){
    test=kruskal.test(expression ~ Type, data = rt1)
    p=test$p.value
  }else{
    test=wilcox.test(expression ~ Type, data = rt1)
    p=test$p.value
  }
  Sig=ifelse(p<0.001,"***",ifelse(p<0.01,"**",ifelse(p<0.05,"*","")))
  geneSig=c(geneSig,Sig)# 根据p值显著标星号
}
colnames(rt)=paste0(colnames(rt),geneSig) 

#转换数据类型
data=melt(rt,id.vars=c("Type"))
colnames(data)=c("Type","Gene","Expression")

#绘图
pdf(file=outFile, width=9, height=5)
p1=ggplot(data,aes(x=Type,
                   y=Expression,
                   fill=Type))+
  guides(fill=guide_legend(title=x))+
  labs(x = x, y = "Gene expression")+ #y轴
  geom_violin()+ 
  scale_fill_manual(values = color) +  geom_boxplot(width=0.2,position=position_dodge(0.9))+ 
  facet_wrap(~Gene,nrow =1)+ 
  theme_bw()+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p1)
dev.off()

四、讨论

小提琴图和箱线图其实大同小异,都是常用于展示基因组间差异表达,很基础的描述性统计可视化图。和violinplot区分开来,部分代码不同,但是效果和目的一样。每个图都可以自己换线框、填充颜色透明度等,发文不重复。

讨论这个帖子(0)垃圾回帖将一律封号处理……