1.rename方便后续处理。
2.trim_galore去接头。
#这个软件使用之前要先安装fastqc和cutadapt
ls -d OV*|while read OV; do echo $OV;
trim_galore -q 20 --phred33 --paired -a AGATCGGAAGAGCACACGTCT -a2 GATCGTCGGACTGTAGAACTCTGAAC \
-e 0.01 -o /data/XXXXX/task_OV/01trim/ ${OV}/*_R1.fastq.gz ${OV}/*_R2.fastq.gz>>${OV}.log; done
3.fastqc看去接头情况
fastqc -o /data/XXXXX/task_OV/02fastqc -t 10 *.fq.gz
multiqc -o /data/XXXXX/task_OV/02fastqc/multiqc *_fastqc.zip
4.bwa比对
for(( i=1; i<157; i++ ))
do
bwa mem /data/XXXXX/WGS/02hg19/chrom.37.fa \
OV${i}_R1_val_1.fq.gz \
OV${i}_R2_val_2.fq.gz > /data/XXXXX/task_OV/03bwa/OV${i}.sam
done
5.samtools
#SAM2BAM
for(( i=1; i<157; i++ ))
do
samtools view -bS OV${i}.sam > /data/XXXXX/task_OV/04samtools/OV${i}.bam
done
#SORT
for(( i=10; i<157; i++ ))
do
samtools sort OV${i}.bam /data/XXXXX/task_OV/05sorted/OV${i}.sort
done
#INDEX
for(( i=1; i<157; i++ ))
do
samtools index OV${i}.sort.bam
done
6.bedtools
bedtools multicov -f 0.9 -bams OV{1..157}.sort.bam -bed /data/XXXXX/task_LE-miR/03-2miRSeq/hsa.37note_mature.gff > differmiR.txt
7.delete repeat
sort rowname.txt |uniq -d >repeat.name.txt
#repeat
setwd("C:\\Users\\dell-xps\\Desktop")
repeat.name<-read.table("repeat.name.txt", header=TRUE)
differ <- read.table("differ2.txt", header=TRUE)
rowname <- read.table("rowname.txt", header=TRUE)
j<-NULL
for(i in 1:6708){
a <- as.vector(repeat.name[i,1])
h <- 0
g <- NULL
for(k in 1:99236){
b <- as.vector(rowname[k,1])
if(a==b){
c <- differ[k,]
h <- h+c
}
g <- cbind(a,h)
g <- as.vector(g)
}
j <- rbind(j,g)
}
write.table(j, "serum.iso.repeat.txt")
repeat.iso<-read.table("repeat.txt", header=TRUE)
differ <- read.table("differ.txt", header=TRUE)
for(i in 1:6708){
a <- as.vector(repeat.iso[i,1])
for(k in 1:99236){
b <- as.vector(differ[k,1])
if(a==b){
differ[k,]<-repeat.iso[i,]
}
}
}
write.table(differ, "serum.iso.repeat.plus.txt")
8.RPM+QC+NORMALIZATION+COMBAT+TEST(看foldchange)
#RPM
setwd("C:\\Users\\dell-xps\\Desktop")
differ <- read.table("differ2.txt", header=TRUE,row.names=1)
RPM <- NULL
for (i in 1:76){
colscore <- colSums(differ)
z <- differ[,i]/colscore[i]*1000000
RPM <- cbind(RPM,z)
}
RPM
write.table(RPM, "RPM.txt")
#QC
RPM<-read.table("RPM.txt",header=T,row.names=1)
h<-NULL
for (m in 1:1504){
i=0
g<-NULL
for (n in 1:67){
a<-RPM[m,n]
if(a==0){
i=i+1
}
}
if(i<17){
g<-RPM[m,]
h<-rbind(g,h)
}
}
write.table(h,"RPMqc.txt")
#NORMALIZATION
RPM <- read.table("RPM.txt", header=TRUE,row.names=1)
norRPM <- log2(RPM+1)
norRPM
write.table(norRPM, "norRPM.txt")
#COMBAT
norRPM <- read.table("norRPM.txt", header=TRUE,row.names=1)
a <- read.table("LEbatch.txt", header=TRUE,row.names=1)
batch = a$batch
library(sva)
norRPM <- as.matrix(norRPM)
combat = ComBat(dat=norRPM, batch=batch, mod=NULL, par.prior=TRUE, prior.plots=FALSE)
write.table(combat, "aftercombat.txt")
t+group
#TEST(FDR)
# ## t-test(order)
d <- read.table("n.txt", header=TRUE,row.names=1)
e <- read.table("t.txt", header=TRUE,row.names=1)
h <- NULL
for(n in 2:490){
d2 <- d[,n]
e2 <- e[,n]
j <- t.test(d2,e2,paired = FALSE)
j2 <- j$p.value
j3 <-as.vector(j2)
h <- cbind(h,j3)
}
write.table(h, "tn.t.pvalue.txt")
# ## excel
# ## fdr
library("fdrtool")
k2 <- read.table("tn_pvalue.txt", header=TRUE,row.names=1)
l2<- fdrtool(as.vector(k2$t.pvalue), statistic="pvalue",cutoff.method="fndr")
m2 <- l2$qval
write.table(m2, "tn.t.padj.txt")
# ## w-test(order)
d <- read.table("n.txt", header=TRUE,row.names=1)
e <- read.table("t.txt", header=TRUE,row.names=1)
h <- NULL
for(n in 2:490){
d2 <- d[,n]
e2 <- e[,n]
j <- wilcox.test(e2,d2, paired=FALSE)
j2 <- j$p.value
j3 <-as.vector(j2)
h <- cbind(h,j3)
}
write.table(h, "tn.w.pvalue.txt")
# ## excel
# ## fdr
library("fdrtool")
k2 <- read.table("tn_pvalue.txt", header=TRUE,row.names=1)
l2<- fdrtool(as.vector(k2$w.pvalue), statistic="pvalue",cutoff.method="fndr")
m2 <- l2$qval
write.table(m2, "tn.w.padj.txt")
# ## p-test(order)
d <- read.table("t&n.txt", header=TRUE,row.names=1)
h <- NULL
library(coin)
for(n in 2:490){
d2 <- d[,n]
d3 <- d[,1]
data2<- data.frame(d2,d3)
j <- oneway_test(d2~d3,data=data2,distribution="approximate"(B=1000000))
j2 <- pvalue(j)
j3 <-as.vector(j2)
h <- cbind(h,j3)
}
write.table(h, "tn.p.pvalue.txt")
9.RANDOMFOREST
differ <-read.table("PCA1.txt",header=T,row.names=1)
# num
library("randomForest")
#data(iris,package='datasets')
set.seed(1500)
index<-sample(2,nrow(differ),replace=T,prob=c(0.7,0.3))
traindata<-differ[index==1,]
testdata<-differ[index==2,]
n<-ncol(differ)-1
errRate<-c(1)
for (i in 1:n){
m<-randomForest(group~.,data=differ,mtry=i,proximity=T)
err<-mean(m$err.rate)
errRate[i]<-err
}
print(errRate)
m=which.min(errRate)
print(m)
# m=5
# ntree
set.seed(150)
rf_ntree<-randomForest(group~.,data=differ)
plot(rf_ntree)
# ntree=350
m<-randomForest(group~.,data=traindata,mtry=5,ntree=350,proximity=T)
print(m)
importance(m)
varImpPlot(m)
# test
pred<-predict(m,newdata=testdata)
freq<-table(pred,testdata$group)
sum(diag(freq))/sum(freq
10.PCA
differ <-read.table("PCA1.txt",header=T,row.names=1)
group = read.table("group.txt", header=T)
differ.pr<- prcomp(differ,scale= TRUE)
summary(differ.pr,loading=T)
#----Standard deviation 标准差 其平方为方差=特征值
#----Proportion of Variance 方差贡献率
#----Cumulative Proportion 方差累计贡献率
#由结果显示 前n个主成分的累计贡献率已经很大 可以舍去另外的主成分 达到降维的目的
#画图
library("ggbiplot")
ggbiplot(differ.pr,obs.scale=1,var.scale=1,groups=group$group,ellipse=F,var.axes=F)
11.HEATMAP
setwd("C:\\Users\\dell-xps\\Desktop")
library('gplots')
biomarker<-read.table("overlap2.txt",header=T,row.names=1)
biomarker<-as.matrix(biomarker)
group<-read.table("group.txt",header=T,row.names=1)
color.map <- function(group) { if (group=="tumor") "navy" else "lightyellow" }
patientcolors <- unlist(lapply(group$group, color.map))
mydist = function(x){
dist(x,method = 'maximum')
}
myclust = function(y){
hclust(y,method='centroid')
}
#"euclidean", "maximum", "manhattan", "canberra", "binary" "minkowski"
#"ward.D", "ward.D2", "single", "complete", "average" "mcquitty" "median" "centroid"
heatmap.2(biomarker,scale="row",
distfun=mydist,
hclustfun = myclust,
col=colorRampPalette(c("blue","yellow2")),
symm = FALSE,
offsetRow=0,
offsetCol=1,
sepwidth=c(0.7,0.03),
key=T,keysize=0.78,trace="none",
density.info="none",
cexCol=1,cexRow=0.7,
ColSideColors=patientcolors,
main="hierarchical clustering of overlap miRs")
#labcol=F
12.ROC
library("pROC")
data<-read.table("ROC.txt",header=T,row.names=1)
ci629 <- ci(data$group,data[,1])
ci629
roc(data$group,data[,1],plot=T,col="red",print.auc=T,main="ROC of hsa-miR-629-5p")
ci584 <- ci(data$group,data[,2])
ci584
roc(data$group,data[,2],plot=T,col="red",print.auc=T,main="ROC of hsa-miR-584-5p")
ci345 <- ci(data$group,data[,3])
ci345
roc(data$group,data[,3],plot=T,col="red",print.auc=T,main="ROC of hsa-miR-345-5p")
ci7e <- ci(data$group,data[,4])
ci7e
roc(data$group,data[,4],plot=T,col="red",print.auc=T,main="ROC of hsa-let-7e-3p")
ci365 <- ci(data$group,data[,5])
ci365
roc(data$group,data[,5],plot=T,col="red",print.auc=T,main="ROC of hsa-miR-365b-3p")
ci1343 <- ci(data$group,data[,6])
ci1343
roc(data$group,data[,6],plot=T,col="red",print.auc=T,main="ROC of hsa-miR-1343-3p")
cicombination <- ci(data$group,data[,7])
cicombination
roc(data$group,data[,7],plot=T,col="red",print.auc=T,main="ROC of combination biomarkers")
plot.roc(data$group, data$hsa.miR.629.5p, main="ROC curve of biomarkers", percent=T, col="#efefef")
lines.roc(data$group, data$hsa.miR.584.5p, percent=T, col="#e9f3ea")
lines.roc(data$group, data$hsa.miR.345.5p, percent=T, col="#d4dee7")
lines.roc(data$group, data$hsa.let.7e.3p, percent=T, col="#f8f2e4")
lines.roc(data$group, data$hsa.miR.365b.3p, percent=T, col="#65a479")
lines.roc(data$group, data$hsa.miR.1343.3p, percent=T, col="#d3ba68")
lines.roc(data$group, data$combination, percent=T, col="#d5695d")
text(70,50,labels="AUC of combined biomarkers: 0.914",adj=c(0, .5))
legend("bottomright", legend=c("hsa-miR-629-5p", "hsa-miR-584-5p","hsa-miR-345-5p", "hsa-let-7e-3p",
"hsa-miR-365b-3p", "hsa-miR-1343-3p","combination"),
col=c("#efefef", "#e9f3ea","#d4dee7","#f8f2e4","#65a479","#d3ba68","#d5695d"), lwd=2)
13.boxplot
library(ggplot2)
library(RColorBrewer)
library(ggthemes)
ROC<-read.table("ROC.txt",header=T,row.names=1)
data = data.frame(group = ROC$group,Value = ROC$hsa.miR.629.5p)
data$group = factor(data$group,levels = c("normal","tumor"))
P1 <- ggplot(data,aes(x=group,y=Value,fill=group))+
stat_boxplot(geom = "errorbar",width=0.15,aes(color="black"))+
geom_boxplot(size=0.7,fill="white",outlier.fill="white",outlier.color="white")+
geom_jitter(aes(fill=group),width =0.2,shape = 23,size=2.5)+
scale_fill_manual(values = c("#F0E442", "#0072B2"))+
scale_color_manual(values=c("black", "black"))+
ggtitle("hsa-miR-629-5p")+
theme_bw()+ #背景变为白色
theme(legend.position="none",
axis.text.x=element_text(colour="black",family="Times",size=14), #设置x轴刻度标签的字体属性
axis.text.y=element_text(family="Times New Roman",size=14,face="plain"), #设置x轴刻度标签的字体属性
axis.title.y=element_text(family="Times New Roman",size = 14,face="plain"), #设置y轴的标题的字体属性
axis.title.x=element_text(family="Times New Roman",size = 14,face="plain"), #设置x轴的标题的字体属性
plot.title = element_text(family="Times New Roman",size=15,face="bold",hjust = 0.5), #设置总标题的字体属性
panel.grid.major = element_blank(), #不显示网格线
panel.grid.minor = element_blank())+
ylab("log2(RPM+1)")+xlab("group") #设置x轴和y轴的标题
P2<-P1+ annotate("text", x=1.5, y=8,label="p-value=0.000001",size=5)
P2
ggsave("box629.png", dpi=300)
14.cortest
cortxt <-read.table("cor.txt",header=T,row.names=1)
cor(x = cortxt[,"LE35"], y = cortxt[,"OV115"], use = "everything", method = "pearson")
cor(x = cortxt[,"LE35"], y = cortxt[,"OV115"], use = "everything", method = "spearman")
cor(x = cortxt[,"LE35"], y = cortxt[,"OV115"], use = "everything", method = "kendall")