GDC数据传输工具(GDC Data Transfer Tool)需与GDC数据门户和GDC数据提交门户结合使用,用于向GDC上传数据或从GDC下载数据。首先,通过GDC数据门户的界面生成清单文件(manifest),或获取文件的UUID以及(对于受控访问数据)身份验证令牌。然后,使用GDC数据传输工具传输清单文件中列出的或由UUID标识的数据文件。
https://docs.gdc.cancer.gov/Data_Transfer_Tool/Users_Guide/Data_Download_and_Upload/
# 直接下载二进制
https://gdc.cancer.gov/access-data/gdc-data-transfer-tool
# 下载源码安装
https://github.com/NCI-GDC/gdc-client#install-pre-commit
git clone https://github.com/NCI-GDC/gdc-client.git
# 帮助文档
gdc-client download --help
# 配置文件
gdc-client settings download --config my-dtt-config.dtt
../software/gdc-client download -m gdc_manifest.2022-10-08.txt
基因表达文件 ( *.tsv)列名为:gene_id gene_name gene_type unstranded stranded_first stranded_second tpm_unstranded fpkm_unstranded fpkm_uq_unstranded
# 输入单个UUID号,下载单个文件
../software/gdc-client download c1751d2c-4e1f-429b-93a0-d65013bf90a2
# 输入多个UUIDs号,空格隔开,下载多个
../software/gdc-client download 3f40030e-08ea-48ff-9352-6ee054b64a47 1a0c3f54-a7ac-446a-98bd-169414496ac6
注: 每个表达谱文件位于单独的文件夹中,文件夹以UUID命名;文件本身不含TCGA样品编号,也不是表达矩阵,需要另行合并。
### 合并从GDC下载的基因表达数据成表达矩阵,行为基因symbol,列为样品名
# '@ para: map_df 含uuids和样品名的数据框
# '@ para: base_dir GDC下载的文件目录,
# 下面含uuids命名的子目录,
# 每个子目录下含表达数据的tsv文件
# '@ para: gene_type: 从lncRNA,miRNA,protein_coding 等中选择
# '@ return 基因表达矩阵,行:基因symbol,列:样品名
get_gdc_exprs_df <- function(map_df,base_dir,gene_type='protein_coding'){
exprs_df <- data.frame()
uuids <- list.files(base_dir)
# 确定uuids在数据框中(有对应的sample.submitter_id)
hit_uuids <- intersect(uuids,map_df$id)
# 逐一读数据,生成表达矩阵
for (uuid in hit_uuids){
sample_id <- ids_mapping_df[which(ids_mapping_df$id==uuid),'sample.submitter_id']
m_dir <- paste(base_dir,uuid,sep='/')
# 表达谱文件以".tsv"结尾
index <- grep(".tsv",list.files(m_dir))
file_name <- list.files(m_dir)[index]
m_file <- paste(m_dir,file_name,sep='/')
# 数据有固定的格式
m_df <- read.table(m_file,header = FALSE,skip = 6)
# m_df[1:3,]
## header
#gene_id gene_name gene_type unstranded stranded_first
#stranded_second tpm_unstranded fpkm_unstranded fpkm_uq_unstranded
colnames(m_df) <- c("gene_id","gene_name","gene_type","unstranded",
"stranded_first","stranded_second","tpm_unstranded",
"fpkm_unstranded","fpkm_uq_unstranded")
# 选择蛋白质编码基因
m_df <- m_df[which(m_df$gene_type==gene_type),]
# dim(m_df)
# 取基因名和count数(正负链count数之和)
m_df <- m_df[,c("gene_name","unstranded")]
# 取基因最多表达的转录本数据
m_df <- dplyr::group_by(m_df,gene_name) %>%
summarise(counts_max = max(unstranded,na.rm = TRUE))
m_df <- as.data.frame(m_df)
# 修改行列名为样品名
colnames(m_df) <- c("gene_name",sample_id)
if(nrow(exprs_df) == 0){
exprs_df <- m_df
}else if(rownames(exprs_df)==rownames(m_df)){
exprs_df[,sample_id] <- m_df[,sample_id]
}
}
# 修改行名为基因名
rownames(exprs_df) <- exprs_df$gene_name
# 得到最终的基因表达谱
exprs_df<- exprs_df[,-1]
return (exprs_df)
}
library(TCGAbiolinks)
library(dplyr)
library(DT)
### TCGAbiolinks包下载SampleSheet
# getGDCprojects()$project_id
project <- 'TCGA-COAD'
#project <- 'TCGA-READ'
query <- GDCquery(project = project,
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "STAR - Counts")
# GDCdownload(query, files.per.chunk = 100)
# colnames(query$results[[1]])
## uuids和tcga样品ids的对应数据框
ids_mapping_df <- query$results[[1]][,c('id','sample.submitter_id')]
head(ids_mapping_df)
### 列出下的数据的uuids
base_dir <- paste0(project_dir,"/GDCdata/")
base_dir <- paste0(base_dir,project)
base_dir <- paste0(base_dir,"/harmonized/Transcriptome_Profiling/Gene_Expression_Quantification")
uuids <- list.files(base_dir)
# 确定uuids在数据框中(有对应的sample.submitter_id)
hit_uuids <- intersect(uuids,ids_mapping_df$id)
### 读mRNA表达谱文件,合并为表达矩阵
exprs_df <- get_gdc_exprs_df(ids_mapping_df,base_dir,gene_type='protein_coding')
# exprs_df[1:3,1:5]
# dim(exprs_df)
# 写入文件
write.csv(exprs_df,paste0(project,"-counts.csv"))
https://docs.gdc.cancer.gov/Data_Portal/Users_Guide/Custom_Set_Analysis/