import scanpy as sc
import anndata as ad
import numpy as np0
import torch
import torch.nn as nn
import torch.optim as optim
# 1. 讀取原始檔案
AData = sc.read_h5ad("6ovaries_analysis_results.h5ad")
adata = sc.read_h5ad("6ovaries_analysis_results.h5ad")
# 2. 檢查目前結構
# 如果讀進來已經是 6 x 20899,就不要執行 .T
# 如果讀進來是 20899 x 6,才執行一次 .T 把它轉正
if adata.n_obs > adata.n_vars:
adata = adata
print(f"數據維度: {adata.n_obs} 樣本, {adata.n_vars} 基因")
print("目前的 obs 欄位:", adata.obs.columns)
# 標準化與對數轉換 (深度學習對數值範圍敏感)
sc.pp.log1p(adata)
sc.pp.scale(adata, max_value=10) # 建議增加 scale 步驟
# 轉換為 Tensor
X = adata.X.toarray() if hasattr(adata.X, "toarray") else adata.X
X_tensor = torch.FloatTensor(X)
# 2. 定義自編碼器模型
class Autoencoder(nn.Module):
def __init__(self, input_dim, latent_dim=2):
super(Autoencoder, self).__init__()
# 編碼器:降低維度
self.encoder = nn.Sequential(
nn.Linear(input_dim, 128),
nn.ReLU(),
nn.Linear(128, latent_dim),
)
# 解碼器:重建原始維度
self.decoder = nn.Sequential(
nn.Linear(latent_dim, 128),
nn.ReLU(),
nn.Linear(128, input_dim),
)
def forward(self, x):
latent = self.encoder(x)
reconstructed = self.decoder(latent)
return reconstructed, latent
# 3. 初始化模型、損失函數與優化器
input_dim = X.shape[1]
latent_dim = 1000 # 只有 6 個樣本,建議將隱藏層設小 (如 2),方便可視化
model = Autoencoder(input_dim, latent_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# 4. 訓練模型
epochs = 1000
model.train()
for epoch in range(epochs):
# 前向傳播
reconstructed, latent = model(X_tensor)
loss = criterion(reconstructed, X_tensor)
# 反向傳播與優化
optimizer.zero_grad()
loss.backward()
optimizer.step()
if (epoch + 1) % 20 == 0:
print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")
# 5. 提取隱藏層特徵 (Latent Space)
model.eval()
with torch.no_grad():
_, latent_features = model(X_tensor)
# 將訓練好的特徵存回 AnnData 的 obsm 中,方便 Scanpy 繪圖
adata.obsm['X_ae'] = latent_features.numpy()
# 6. 可視化
sc.pl.scatter(adata, basis='ae', color='treatment', title="Autoencoder Latent Space")
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
# 1. 提取編碼器第一層的權重 (Shape: 64 x 20899)
# model.encoder[0] 對應 nn.Linear(input_dim, 64)
weights = model.encoder[0].weight.detach().numpy()
# 2. 計算每個基因(column)的綜合影響力得分
# 我們取絕對值的總和,因為正向和負向權重都代表該基因具有資訊量
gene_importance = np.sum(np.abs(weights), axis=0)
# 3. 建立 DataFrame 並與基因名稱對應
importance_df = pd.DataFrame({
'gene_symbol': adata.var_names,
'importance_score': gene_importance
})
print("-^-" * 30)
# --- 1. 計算統計基準 ---
mean_score = gene_importance.mean()
std_score = gene_importance.std()
suggested_cutoff = mean_score + 2 * std_score
print(f"\n[統計資訊]")
print(f"平均得分 (Mean): {mean_score:.4f}")
print(f"標準差 (Std): {std_score:.4f}")
print(f"建議門檻值 (Mean + 2SD): {suggested_cutoff:.4f}")
print("-" * 30)
# --- 2. 暫停並等待輸入 ---
user_input = input(f"請輸入您想要的 Cutoff 門檻值 (直接按 Enter 則使用建議值 {suggested_cutoff:.4f}): ")
# 判斷輸入邏輯
try:
if user_input.strip() == "":
cutoff = suggested_cutoff
else:
cutoff = float(user_input)
except ValueError:
print("輸入格式錯誤,將自動使用建議門檻值。")
cutoff = suggested_cutoff
print(f"\n確認使用 Cutoff = {cutoff:.4f} 進行篩選...")
plt.figure(figsize=(8, 4))
sns.histplot(importance_df['importance_score'], bins=100, kde=True)
plt.axvline(cutoff, color='red', linestyle='--', label='Mean + 2SD')
plt.title("Distribution of Gene Importance Scores")
plt.legend()
plt.show()
# --- 3. 執行篩選 ---
# 標記是否顯著
importance_df['is_significant'] = importance_df['importance_score'] > cutoff
# 挑選出符合條件的基因並排序
top_genes_df = importance_df[importance_df['is_significant']].sort_values(
by='importance_score', ascending=False
)
# --- 4. 顯示結果 ---
num_sig = len(top_genes_df)
print(f"篩選完成!共找到 {num_sig} 個顯著基因。")
print('============================'*2)
print("--- Autoencoder 識別出的最具影響力基因 --- 基因數目 =", len(top_genes_df))
print(top_genes_df)
import matplotlib.pyplot as plt
# 5. 可視化:將這些重要基因的得分畫出來
import seaborn as sns
plt.figure(figsize=(10, 6))
sns.barplot(data=top_genes_df, x='importance_score', y='gene_symbol', palette='viridis')
plt.title(f"Influential Genes identified by AE, n = {num_sig}")
plt.xlabel("Cumulative Absolute Weights")
plt.show()
results_dict = {}
for i in range(1, 3): # 跑兩次
print(f"\n>>> 開始第 {i} 輪訓練...")
# 這裡放入你原本的代碼:定義模型、訓練、計算 importance...
# (為了簡潔,假設你已將 AE 訓練邏輯包裝成一個函數或直接放在這裡)
# 訓練與篩選邏輯結束後:
current_genes = set(importance_df[importance_df['importance_score'] > cutoff]['gene_symbol'])
results_dict[f'run_{i}'] = current_genes
print(f"第 {i} 輪完成,篩選出 {len(current_genes)} 個基因")
# 最後提取出來
genes_run1 = results_dict['run_1']
genes_run2 = results_dict['run_2']
# 假設第一次篩選出的基因清單為 genes_run1,第二次為 genes_run2
# 這裡根據你提供的 top_genes_df 邏輯提取基因名稱
# 1. 計算交集 (Intersection)
stable_genes = genes_run1.intersection(genes_run2)
# 2. 計算聯集 (Union)
all_detected_genes = genes_run1.union(genes_run2)
# 3. 計算重合率 (Jaccard Index)
jaccard_index = len(stable_genes) / len(all_detected_genes)
print("\n" + "="*40)
print("基因穩定性分析結果")
print("="*40)
print(f"第一次運行基因數: {len(genes_run1)}")
print(f"第二次運行基因數: {len(genes_run2)}")
print(f"真正穩定的基因數 (交集): {len(stable_genes)}")
print(f"重合率 (Jaccard Index): {jaccard_index:.2%}")
# 4. 顯示前 10 個最穩定的基因
print("\n前 10 個穩定基因範例:")
print(list(stable_genes)[:10])
# 5. 存檔:將穩定基因導出
pd.DataFrame({'stable_genes': list(stable_genes)}).to_csv("stable_genes_intersection.csv", index=False)
print("\n已將穩定基因清單存至: stable_genes_intersection.csv")
import pandas as pd
import gseapy as gp
import networkx as nx
import matplotlib.pyplot as plt
from pathlib import Path
# --- 1. 路徑設定 ---
desktop = Path.home() / "Desktop"
stable_csv = Path("stable_genes_intersection.csv") # AE 產出的檔案
excel_source = desktop / "Aja 21649_LeidCom_clustered.xlsx"
output_dir = desktop / "Autoncoder_Enrichment_Results"
output_dir.mkdir(exist_ok=True)
# --- 2. 數據讀取與映射 (Mapping) ---
print("🔍 正在進行基因映射...")
try:
# 讀取 AE 篩選出的穩定基因 (假設欄位名稱為 stable_genes)
ae_genes = pd.read_csv(stable_csv)['stable_genes'].tolist()
# 讀取原始含有註解的 Excel
df_ref = pd.read_excel(excel_source)
# 進行映射:在 Excel 中找尋 Transcriptid 存在於 ae_genes 清單中的行
# 注意:請確保兩者的名稱格式一致
mapped_df = df_ref[df_ref['Transcriptid'].isin(ae_genes)].copy()
# 提取 Gene_symbol 用於富集分析 (去除空值與重複)
gene_list = mapped_df['Gene_symbol'].replace('#N/A', pd.NA).dropna().unique().tolist()
print(f"✅ 映射完成!AE 篩選 {len(ae_genes)} 個 ID,成功對應到 {len(gene_list)} 個有效基因。")
except Exception as e:
print(f"❌ 讀取或映射失敗: {e}")
exit()
# --- 3. 基因富集分析 (Enrichr) ---
print("🚀 執行 KEGG 富集分析...")
try:
enr = gp.enrichr(gene_list=gene_list,
gene_sets=['KEGG_2021_Human'], # 或選取其他適合的物種資料庫
organism='Human',
cutoff=0.5)
res = enr.results
# 儲存完整富集結果
res.to_excel(output_dir / "AE_Stable_Genes_Enrichment.xlsx", index=False)
# 篩選出顯著的通路 (Adj P < 0.05) 作為繪圖對象
top_paths = res[res['Adjusted P-value'] < 0.1].head(10) # 取前 10 名
if top_paths.empty:
print("⚠️ 無顯著富集通路,調整門檻取前 5 名展示...")
top_paths = res.nsmallest(10, 'Adjusted P-value')
except Exception as e:
print(f"❌ 富集分析失敗: {e}")
exit()
# --- 4. 繪製 Cnetplot (Gene-Concept Network) ---
print("🎨 正在生成 Cnetplot...")
try:
G = nx.Graph()
pathway_label_map = {}
for _, row in top_paths.iterrows():
pathway = row['Term']
p_val = row['Adjusted P-value']
pathway_label_map[pathway] = f"{pathway}\n(p={p_val:.2e})"
# 獲取該通路中的基因並與我們的清單取交集
path_genes = str(row['Genes']).split(';')
valid_genes = [g for g in path_genes if g in gene_list]
# 加入通路節點
G.add_node(pathway, node_type='pathway', size=len(valid_genes))
# 加入基因節點與邊 (限制顯示數量避免畫面太亂)
for gene in valid_genes[:15]:
G.add_node(gene, node_type='gene')
G.add_edge(pathway, gene)
# 繪圖佈局
plt.figure(figsize=(15, 12))
pos = nx.spring_layout(G, k=0.5, iterations=50, seed=42)
# 分別繪製通路與基因
path_nodes = [n for n, a in G.nodes(data=True) if a.get('node_type') == 'pathway']
gene_nodes = [n for n, a in G.nodes(data=True) if a.get('node_type') == 'gene']
# 繪製邊
nx.draw_networkx_edges(G, pos, edge_color='#dfe6e9', alpha=0.5)
# 繪製 Pathway 節點 (紅圈)
sizes = [G.nodes[n]['size'] * 500 for n in path_nodes]
nx.draw_networkx_nodes(G, pos, nodelist=path_nodes, node_color='#ff7675',
node_size=sizes, edgecolors='black', label='Pathways')
# 繪製 Gene 節點 (藍圈)
nx.draw_networkx_nodes(G, pos, nodelist=gene_nodes, node_color='#74b9ff',
node_size=150, alpha=0.8, label='Genes')
# 加入標籤 (Pathway)
path_labels = {n: pathway_label_map[n] for n in path_nodes}
nx.draw_networkx_labels(G, pos, labels=path_labels, font_size=10, font_weight='bold')
# 加入標籤 (Gene)
gene_labels = {n: n for n in gene_nodes}
nx.draw_networkx_labels(G, pos, labels=gene_labels, font_size=8)
plt.title("Cnetplot: AE Stable Genes & Top KEGG Pathways", fontsize=15)
plt.axis('off')
# 存檔
plt.savefig(output_dir / "AE_Stable_Cnetplot.png", dpi=300, bbox_inches='tight')
plt.show()
print(f"✅ 分析完成!結果已存至: {output_dir}")
except Exception as e:
print(f"❌ 繪圖失敗: {e}")
import pandas as pd
import gseapy as gp
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import traceback
import sys # 修正 exit() 問題
# --- 1. 路徑與參數設定 ---
desktop = Path.home() / "Desktop"
stable_csv = Path("stable_genes_intersection.csv")
excel_source = desktop / "Aja 21649_LeidCom_clustered.xlsx"
output_dir = desktop / "AE_Stability_Network_Analysis"
output_dir.mkdir(exist_ok=True)
# 你的目標數值欄位
value_col = 'avg_diff'
# --- 2. 數據映射與預處理 ---
print("🔍 正在進行基因數據映射...")
try:
ae_ids = pd.read_csv(stable_csv)['stable_genes'].tolist()
df_ref = pd.read_excel(excel_source)
# --- 自動欄位檢查機制 ---
if value_col not in df_ref.columns:
print(f"⚠️ 找不到欄位 '{value_col}'。")
print(f"現有欄位清單: {df_ref.columns.tolist()}")
# 嘗試找尋備用欄位
alternatives = ['log2FoldChange', 'logFC', 'p_val_adj', 'cluster']
found_alt = next((alt for alt in alternatives if alt in df_ref.columns), None)
if found_alt:
value_col = found_alt
print(f"💡 自動切換至備用欄位: '{value_col}'")
else:
print("❗ 找不到任何數值欄位,將建立預設數值 (1.0) 以維持繪圖。")
df_ref['default_value'] = 1.0
value_col = 'default_value'
mapped_df = df_ref[df_ref['Transcriptid'].isin(ae_ids)].copy()
if mapped_df.empty:
print("❌ 映射後沒有任何基因!請檢查 Transcriptid 是否匹配。")
sys.exit()
# 建立對照字典
gene_value_map = mapped_df.groupby('Gene_symbol')[value_col].mean().to_dict()
gene_list = list(gene_value_map.keys())
print(f"✅ 映射完成:{len(gene_list)} 個獨特基因符號。")
except Exception as e:
print(f"❌ 映射出錯: {e}")
traceback.print_exc()
sys.exit() # 修正後的安全退出
# --- 3. 基因富集分析與長條圖繪製 ---
print("🚀 執行 KEGG 富集分析...")
try:
enr = gp.enrichr(gene_list=gene_list,
gene_sets=['KEGG_2021_Human'],
organism='Human',
cutoff=0.5)
res = enr.results
if res.empty or 'Adjusted P-value' not in res.columns:
print("⚠️ 富集分析沒有結果,可能基因清單太少或不匹配。")
else:
res['-log10_Adj_P'] = -np.log10(res['Adjusted P-value'].replace(0, 1e-10))
# 繪製 Pathway Bar Plot
# 修正:確保 n 是整數
n_top = 20
top_10 = res.nsmallest(n_top, 'Adjusted P-value').iloc[::-1]
plt.figure(figsize=(10, 8)) # 稍微增加高度以容納 20 條
colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(top_10)))
bars = plt.barh(top_10['Term'], top_10['-log10_Adj_P'], color=colors)
# 加入顯著性參考線 (P < 0.05 -> -log10(P) > 1.3)
# plt.axvline(x=-np.log10(0.1), color='red', linestyle='--', alpha=0.5, label='P=0.1')
plt.xlabel('-log10(Adjusted P-value)', fontsize=12)
plt.ylabel('Pathway Term', fontsize=12)
plt.title('Top Enriched KEGG Pathways (AE Stable Genes)', fontsize=14)
plt.legend()
plt.tight_layout()
plt.savefig(output_dir / "Pathway_Enrichment_Barplot.png", dpi=300)
plt.show()
# --- 4. 繪製純基因網路圖 (Network Graph) ---
print("🎨 生成穩定基因網路圖...")
G = nx.Graph()
top_paths = res.nsmallest(5, 'Adjusted P-value')
for _, row in top_paths.iterrows():
path_genes = [g for g in str(row['Genes']).split(';') if g in gene_list]
for i in range(len(path_genes)):
for j in range(i + 1, len(path_genes)):
G.add_edge(path_genes[i], path_genes[j])
if len(G.nodes) == 0:
print("⚠️ 基因間無共同通路聯繫,跳過網路圖製作。")
else:
plt.figure(figsize=(12, 12))
pos = nx.kamada_kawai_layout(G)
node_colors = [gene_value_map.get(n, 0) for n in G.nodes()]
nx.draw_networkx_edges(G, pos, alpha=0.2, edge_color='gray')
nodes = nx.draw_networkx_nodes(G, pos, node_size=500, node_color=node_colors,
cmap=plt.cm.RdBu_r, edgecolors='black', linewidths=0.5)
nx.draw_networkx_labels(G, pos, font_size=8, font_weight='bold')
plt.colorbar(nodes, label=f'Value ({value_col})', shrink=0.7)
plt.title("Gene Interaction Network")
plt.axis('off')
plt.savefig(output_dir / "Gene_Interaction_Network.png", dpi=300)
plt.show()
print(f"✨ 分析完成!結果已儲存至: {output_dir}")
except Exception as e:
traceback.print_exc()
No comments:
Post a Comment