WES分析5-比对

汪安宁

2023-12-01

比对

cd /home/lzn/WES/genome
#参考基因组位于该目录下
mkdir index

#hisat2 建立index
hisat2-build -p 8 -q hg38.fa.gz ./index/hg38 &
#使用软件建立index使用时间较长，还是从hisat2官网下载已经建立好的index

wget https://genome-idx.s3.amazonaws.com/hisat/grch38_snptran.tar.gz
#数据存储在Amazon Web Services上，用梯子下载

tar -zxvf grch38_snptran.tar.gz

cd /home/lzn/WES/cleandata
ln -s ../rawdata/*.fastq ./
mkdir ../align

#因为分析的目的是检测出SNP和indel，所以在比对的时候要允许一定范围的错配，hisat2没有设置允许错配的参数，因此这一步需要换一个比对软件。
#浏览别人的分析流程，都用BWA比对(GATK官网提供的最佳分析流程里使用的是BWA)，但没有设置允许错配数，既然是学习，就按最佳分析流程走，使用BWA比对。

bwa index -a bwtsw -p resources_broad_hg38_v0_Homo_sapiens_assembly38.fasta.64 \ resources_broad_hg38_v0_Homo_sapiens_assembly38.fasta
#将index转移至index文件夹下
#index文件可从网站直接下载


#mappig
export wkd=/home/lzn/WES
INDEX=$wkd/genome/index/resources_broad_hg38_v0_Homo_sapiens_assembly38.fasta.64
touch $wkd/run.log
for sample in `cat $wkd/sample.list`
do
bwa mem -t 8 -M -Y -R "@RG\tID:$sample\tSM:$sample\tLB:WES\tPL:Illumina"  $INDEX\ 
#linux下双引号可以包含特殊字符
$wkd/cleandata/$sample\_1.fastq $wkd/cleandata/$sample\_2.fastq |samtools view \
-Sb -@ 8 - >$wkd/align/$sample.bam 
echo "$sample mapping step is done" >> $wkd/run.log
done

#sortbam
#picard=~/picard/picard.jar 已经写入.bashrc文件中
for sample in `cat $wkd/sample.list`
do
java -jar $picard SortSam \
-INPUT $wkd/align/$sample.bam \
-OUTPUT $wkd/align/$sample.sorted.bam \
-SORT_ORDER coordinate	#按坐标排序，等同于samtools sort -p
echo "$sample sortsam step is done" >> $wkd/run.log
done
#picard不能设置多线程，使用for循环逐个处理比较慢。可以多个命令并行运行。
cat $wkd/sample.list |awk '{print "java -jar $picard SortSam -INPUT $wkd/align/"$1".bam -OUTPUT $wkd/align/"$1".sorted.bam -SORT_ORDER coordinate &"}' >sort.sh
bash sort.sh &

#MarkDuplicates
for sample in `cat $wkd/sample.list`
do
java -jar $picard MarkDuplicates \
-REMOVE_DUPLICATES true \
-INPUT $wkd/align/$sample.sorted.bam \
-OUTPUT $wkd/align/$sample.sorted.dedup.bam \
-METRICS_FILE $wkd/align/$sample.metrics.txt
echo "$sample MarkDuplicates step is done" >> $wkd/run.log
done
cat $wkd/sample.list |awk '{print "java -jar $picard MarkDuplicates -INPUT  $wkd/align/"$1".sorted.bam -OUTPUT $wkd/align/"$1".sorted.dedup.bam -METRICS_FILE  $wkd/align/"$1".metrics.txt &"}'  > dedup.sh
bash dedup.sh &
# MarkDuplicatesSpark performs both the duplicate marking step and the sort step for this stage of pre-processing.
#MarkDuplicates and SortSam are currently implemented as single threaded tools and consequently cannot take advantage of core parallelism. It is advised to run MarkDuplicatesSpark over MarkDuplicates followed by SortSam on machines with a high number of cores or on machines with access to fast disk drives.
#MarkDuplicatesSpark https://gatk.broadinstitute.org/hc/en-us/articles/360047216351-MarkDuplicatesSpark

#Index bam
for sample in `cat $wkd/sample.list`
do
java -jar $picard BuildBamIndex \
-INPUT $wkd/align/$sample.sorted.dedup.bam 
echo "$sample index bam step is done" >> $wkd/run.log
done
cat $wkd/sample.list |awk '{print "java -jar $picard BuildBamIndex -INPUT  $wkd/align/"$1".sorted.dedup.bam &"}' > index.sh
bash index.sh

使用GATK中的Spark功能（以后在测试）

#mappig
export wkd=/home/lzn/WES
INDEX=$wkd/genome/index/resources_broad_hg38_v0_Homo_sapiens_assembly38.fasta.64
touch $wkd/run.log
for sample in `cat $wkd/sample.list`
do
bwa mem -t 8 -R "@RG\tID:$sample\tSM:$sample\tLB:WES\tPL:Illumina"  $INDEX \
#linux下双引号可以包含特殊字符
$wkd/cleandata/$sample\_1.fastq $wkd/cleandata/$sample\_2.fastq |samtools view \
-Sb -@ 8 - >$wkd/align/$sample.bam && \
echo "$sample mapping step is done" >> $wkd/run.log
done

#MarkDuplicatesSpark
for sample in `cat $wkd/sample.list`
do
java -jar $gatk MarkDuplicatesSpark \
--remove-all-duplicates true \
-I $wkd/align/$sample.sorted.bam \
-O $wkd/align/$sample.sorted.dedup.bam \
-M $wkd/align/$sample.metrics.txt && \
echo "$sample MarkDuplicates step is done" >> $wkd/run.log
done
#MarkDuplicatesSpark performs both the duplicate marking step and the sort step for this stage of pre-processing.As an alternative to MarkDuplicatesSpark this step can be performed by using the Picard implementation of MarkDuplicates to perform the duplicate marking followed by SortSam to sort the reads.
#写脚本注意每行末尾不要有多余的空格

WES分析5-比对

比对

相关阅读

相关文章

相关问答

相关文档