寶藏:TCGA任意基因任意腫瘤,隨意分析(附數據和代碼)

當我們拿到一個基因,肯定是想知道它的各種基本信息,今天的教程可以做到

1 任意一個腫瘤在泛癌中的表達情況

2 任意一個基因在腫瘤和正常組織中的表達情況

3 任意一個基因在腫瘤不同分期,不同性別等臨床特性的表達情況

4 任意一個基因在任意一個腫瘤,或腫瘤的某種特征中的生存分析

5 這個數據能做到的太多,只要充分發揮想象力,所有的數據獲取方式見文末

首先我們從UCSC Xena數據框下載pancancer的標準化后的表達譜和臨床資料

放到我們的工作目錄上,解壓

讀取數據

這一步耗時較長(如果這個過程你的電腦hold不住了,可以直接用后面整理好的數據,開始作圖)

rm(

list

= ls())

library(tidyverse)

#讀取數據

ALLdata <- data.table::fread(

"tcga_RSEM_gene_tpm"

,data.table = F)

ALLdata[

1

:

5

,

1

:

5

]

讀取臨床信息

#讀取臨床信息

clin

<- data.table::fread(

"Survival_SupplementalTable_S1_20171025_xena_sp"

,data.table = F)

篩選有用的臨床資料,并將復雜的名字改成簡單的名字

clin1 <-clin %>%

select

(sample,

"cancer type abbreviation"

,gender,ajcc_pathologic_tumor_stage,OS,OS.time) %>%

#選取這些列

rename

(Type=

"cancer type abbreviation"

,Stage=ajcc_pathologic_tumor_stage,status=OS,

time

=OS.time)

讀取基因注釋文件(詳見 TCGA數據下載與ID轉換)

gtf <- rtracklayer::import(

"Homo_sapiens.GRCh38.99.chr.gtf.gz"

)

#轉換為數據框

gtf <-

as

.data.frame(gtf)

#保存

save(gtf,file =

"gtf.Rdata"

)

去掉基因名小數點及后面的數字方便下一步轉換

colnames(ALLdata)[

1

]<-

"gene_id"

ALLdata1<-separate(ALLdata,gene_id,

into

= c(

"gene_id"

),sep=

"\\."

)

ALLdata1[

1

:

5

,

1

:

5

]

提取編碼蛋白,與表達譜進行合并以轉換基因名。

(也可以取miRNA或者lncRNA等)

mRNA<-dplyr::filter(gtf,

type

==

"gene"

,gene_biotype==

"protein_coding"

)%>%

#選擇編碼蛋白

select(gene_name,gene_id,gene_biotype)%>%

#選擇有用的三列

inner_join(ALLdata1,by =

"gene_id"

)%>%

#與表達譜合并

select(-gene_id,-gene_biotype)%>%

distinct(gene_name,.keep_all=T)

mRNA[1:5,1:5]

將gene_name這一列作為行名

row

.names

(

mRNA

)<

-mRNA

[,1]

mRNA

<

-mRNA

[,-1]

mRNA

[1:5,1:5]

這個時候繼續下去可能電腦hold不住了,先保存下文件

save(mRNA,file =

"pancancer_mRNA.Rdata"

)

save(clin1,file =

"pancancer_clin.Rdata"

)

重新開始

rm(

list

= ls())

load(

"pancancer_mRNA.Rdata"

)

將表達譜倒置,方便后續與臨床資料的合并

mRNA

<

-as

.data

.frame

(

t

(

mRNA

))

mRNA

[1:5,1:5]

接下來是設計Normal和Cancer的分組,根據TCGA數據ID的特性可區分Normal和Cancer TCGA差異分析及ggplot作圖驗證

group_list=ifelse(

as

.numeric(substr(rownames(mRNA),

14

,

15

)) <

10

,

"tumor"

,

"normal"

)

design <- model.matrix(~

0

+factor(group_list))

colnames(design)=levels(factor(group_list))

rownames(design)=rownames(mRNA)

head(design)

這里我用了之前做差異分析時分組的代碼,應該可以寫成更簡潔的形式

給design數據數據變成數據框,添加一列ID

class

(design)

design<-

as

.

data

.frame(design)

design$ID<-row.names(design)

design[

1

:

3

,

1

:

3

]

將design分組中tumor這一列的1換成Tumor,0換成Normal

design

$tumor

[design

$tumor

==1] <-

"Tumor"

#1換成Tumor

design

$tumor

[design

$tumor

==0] <-

"Normal"

#0換成Normal

design<-design[,-1]

#去掉normal這一列

colnames(design)[1]<-

"Type"

#cancer這一列列名改為Type

head(design)

合并數據

mRNA$ID<-row.names(mRNA)

mRNA=inner_join(mRNA,design,by =

"ID"

,copy=T)

mRNA[1:5,1:5]

調整列的順序便于觀察

mRNA<-

select

(mRNA,ID,Type,everything())

mRNA[

1

:

5

,

1

:

5

]

加載臨床資料

load

(

"pancancer_clin.Rdata"

)

clin1[

1

:

5

,

1

:

5

]

為了合并,將列名sample換成ID,由于Type列名重復了,將Type換成Cancer

clin1<-rename(clin1,ID=sample,Cancer=Type)

clin1[1:5,1:5]

合并

drawdata<-dplyr::inner_join(mRNA,clin1,by=

"ID"

,copy=T)

drawdata[

1

:

5

,

1

:

5

]

調整列名便于觀察

drawdata<-

select

(drawdata,ID,Cancer,gender,Stage,status,

time

,everything())

drawdata[

1

:

5

,

1

:

10

]

保存數據

save(drawdata,file =

"pancancer_drawdata.Rdata"

)

重新加載數據(以后就可以直接加載這個數據畫圖了,但是別忘了加載包)

rm(

list

= ls())

load(

"pancancer_drawdata.Rdata"

)

drawdata[

1

:

5

,

1

:

10

]

看看每個腫瘤和正常組織的個數

table(

filter

(drawdata,Type==

"Tumor"

)

$Cancer

)

table(

filter

(drawdata,Type==

"Normal"

)

$Cancer

)

接下來就可以任意畫圖啦!

看一看單基因在泛癌中的表達

p

<

-ggplot(filter(drawdata,Type=="Tumor"),aes(x

=

Cancer,

y

=

SPP1,color

=

Cancer))+

geom_boxplot

()+

geom_jitter

()+

theme_bw

()+

theme

(

legend.position

=

"none"

)

p

加上正常組織

library(ggpubr)

p <- ggboxplot(drawdata, x =

"Cancer"

, y =

"SPP1"

,

fill =

"Type"

,legend=

F

,palette =

c

(

"#00AFBB"

,

"#E7B800"

))+

theme_bw()

p

單個腫瘤差異表達

p

<

-

ggboxplot

(

filter

(

drawdata

,

Cancer

==

"LIHC"

),

x

=

"Type"

,

y

=

"SPP1"

,

fill

=

"Type"

,

legend

=

F,palette

=

c(

"#

00AFBB

", "#

E7B800

"),

bxp.errorbar

=

T)+

theme

(

legend.position

=

"none"

)+

ylab

(

label

=

"SPP1 expression"

)

p

加上P值

my_comparisons <-

list

( c(

"Tumor"

,

"Normal"

) )

p + stat_compare_means(comparisons = my_comparisons)

腫瘤分期表達

先看一下分期的情況

table(drawdata

$Stage

)

只取Stage I II III IV期的數據,先整理分期情況

drawdata

$Stage

<-str_replace_all(drawdata

$Stage

, c(

"Stage IVC"

=

"Stage IV"

))

drawdata

$Stage

<-str_replace_all(drawdata

$Stage

, c(

"Stage IVB"

=

"Stage IV"

))

drawdata

$Stage

<-str_replace_all(drawdata

$Stage

, c(

"Stage IVA"

=

"Stage IV"

))

drawdata

$Stage

<-str_replace_all(drawdata

$Stage

, c(

"Stage IIIC"

=

"Stage III"

))

drawdata

$Stage

<-str_replace_all(drawdata

$Stage

, c(

"Stage IIIB"

=

"Stage III"

))

drawdata

$Stage

<-str_replace_all(drawdata

$Stage

, c(

"Stage IIIA"

=

"Stage III"

))

drawdata

$Stage

<-str_replace_all(drawdata

$Stage

, c(

"Stage IIC"

=

"Stage II"

))

drawdata

$Stage

<-str_replace_all(drawdata

$Stage

, c(

"Stage IIB"

=

"Stage II"

))

drawdata

$Stage

<-str_replace_all(drawdata

$Stage

, c(

"Stage IIA"

=

"Stage II"

))

drawdata

$Stage

<-str_replace_all(drawdata

$Stage

, c(

"Stage IB"

=

"Stage I"

))

drawdata

$Stage

<-str_replace_all(drawdata

$Stage

, c(

"Stage IA"

=

"Stage I"

))

table(drawdata

$Stage

)

過濾數據只留下Stage I II III IV期的數據

Stagedata<-filter(drawdata,Stage ==

"Stage I"

|Stage ==

"Stage II"

| Stage ==

"Stage III"

| Stage ==

"Stage IV"

)

table(Stagedata

$Stage

)

現在就可以畫任意腫瘤任意基因分期的表達了

p

<

-

ggboxplot

(

filter

(

Stagedata

,

Cancer

==

"LIHC"

),

x

=

"Stage"

,

y

=

"SPP1"

,

fill

=

"Stage"

,

legend

=

F,bxp.errorbar

=

T)+

theme

(

legend.position

=

"none"

)+

ylab

(

label

=

"SPP1 expression"

)

p

調整橫坐標順序

Stagedata

$Stage

<-factor(Stagedata

$Stage

,levels=c(

"Stage I"

,

"Stage II"

,

"Stage III"

,

"Stage IV"

))

p <- ggboxplot(

filter

(Stagedata,Cancer==

"LIHC"

), x =

"Stage"

, y =

"SPP1"

,

fill =

"Stage"

,legend=F,bxp.errorbar=T)+

theme(legend.position=

"none"

)+

ylab(label =

"SPP1 expression"

)

p

加上P值

my_comparisons <- list(

c

(

"Stage I"

,

"Stage II"

),

c

(

"Stage I"

,

"Stage III"

),

c

(

"Stage I"

,

"Stage IV"

),

c

(

"Stage II"

,

"Stage III"

),

c

(

"Stage II"

,

"Stage IV"

),

c

(

"Stage III"

,

"Stage IV"

) )

p + stat_compare_means(comparisons = my_comparisons,method =

"t.test"

)

還可以看不同性別,該基因的差異表達,這里就不再舉例子了,一般也用不到。

有了這個數據,我們還可以做任意基因在任意腫瘤中的生存分析

先加載包

library(survminer)

library(survival)

看看Status和time這兩列的情況

table(drawdata

$status

)

table(drawdata

$time

)

把隨訪時間為0的,和正常的樣本去掉

OSdata<-filter(drawdata,drawdata

$time

!=0 & drawdata

$Type

==

"Tumor"

)

table(OSdata

$time

)

OSdata[1:5,1:10]

計算肝癌中SPP1表達的中位數

LIHC<-filter(OSdata,Cancer==

"LIHC"

)

med.exp<-median(LIHC$

"SPP1"

)

根據中位數分成高表達和低表達兩組

more.med.exp.index<-

which

(LIHC$

"SPP1"

>=med.exp)

less.med.exp.index<-

which

(LIHC$

"SPP1"

<med.exp)

LIHC$

"SPP1"

[more.med.exp.index]<-paste0(

"High expression ("

,

length(more.med.exp.index),

")"

)

LIHC$

"SPP1"

[less.med.exp.index]<-paste0(

"Low expression ("

,

length(less.med.exp.index),

")"

)

LIHC.fit<-survfit(Surv(time, status) ~ SPP1, data = LIHC)

畫圖

LIHC.survival

<

-ggsurvplot(LIHC.fit,

data

=

LIHC,

risk.table

=

T,

risk.table.col

=

"strata"

,

risk.table.y.text

=

F,

risk.table.title

=

"Number at risk"

,

pval

=

TRUE,

#

conf.int

=

TRUE,

xlab

=

"Time (days)"

,

ggtheme

=

theme_light(),

surv.median.line

=

"hv"

,

title

=

"LIHC SPP1 Survival"

)

LIHC.survival

當然,我們還可以做任意一個分期,或者不同性別中任意基因的生存分析,操作方法都是一樣的。

其實這個數據還可以做很多分析,包括任意兩個基因在任意腫瘤中的相關性分析,以及在泛癌中的相關性分析等等。

免責聲明:本文僅代表文章作者的個人觀點,與本站無關。其原創性、真實性以及文中陳述文字和內容未經本站證實,請讀者僅作參考,并自行核實相關內容。如發現有害或侵權內容,請聯系郵箱:jubao@pinlue.com,我們將在第一時間進行核實處理。

http://image95.pinlue.com/image/75.jpg
分享
評論
首頁
暖暖高清在线观看视频桃花社区视频在线观看播放