L'anguille et le chat (鰻與貓): Python coding of AUTOENCODER on Japanese eel ovarian transcriptome features-selection

import scanpy as sc

import anndata as ad

import numpy as np0

import torch

import torch.nn as nn

import torch.optim as optim

# 1. 讀取原始檔案

AData = sc.read_h5ad("6ovaries_analysis_results.h5ad")

adata = sc.read_h5ad("6ovaries_analysis_results.h5ad")

# 2. 檢查目前結構

# 如果讀進來已經是 6 x 20899，就不要執行 .T

# 如果讀進來是 20899 x 6，才執行一次 .T 把它轉正

if adata.n_obs > adata.n_vars:

adata = adata

print(f"數據維度: {adata.n_obs} 樣本, {adata.n_vars} 基因")

print("目前的 obs 欄位：", adata.obs.columns)

# 標準化與對數轉換 (深度學習對數值範圍敏感)

sc.pp.log1p(adata)

sc.pp.scale(adata, max_value=10) # 建議增加 scale 步驟

# 轉換為 Tensor

X = adata.X.toarray() if hasattr(adata.X, "toarray") else adata.X

X_tensor = torch.FloatTensor(X)

# 2. 定義自編碼器模型

class Autoencoder(nn.Module):

def __init__(self, input_dim, latent_dim=2):

super(Autoencoder, self).__init__()

# 編碼器：降低維度

self.encoder = nn.Sequential(

nn.Linear(input_dim, 128),

nn.ReLU(),

nn.Linear(128, latent_dim),

)

# 解碼器：重建原始維度

self.decoder = nn.Sequential(

nn.Linear(latent_dim, 128),

nn.ReLU(),

nn.Linear(128, input_dim),

)

def forward(self, x):

latent = self.encoder(x)

reconstructed = self.decoder(latent)

return reconstructed, latent

# 3. 初始化模型、損失函數與優化器

input_dim = X.shape[1]

latent_dim = 1000 # 只有 6 個樣本，建議將隱藏層設小 (如 2)，方便可視化

model = Autoencoder(input_dim, latent_dim)

criterion = nn.MSELoss()

optimizer = optim.Adam(model.parameters(), lr=1e-3)

# 4. 訓練模型

epochs = 1000

model.train()

for epoch in range(epochs):

# 前向傳播

reconstructed, latent = model(X_tensor)

loss = criterion(reconstructed, X_tensor)

# 反向傳播與優化

optimizer.zero_grad()

loss.backward()

optimizer.step()

if (epoch + 1) % 20 == 0:

print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

# 5. 提取隱藏層特徵 (Latent Space)

model.eval()

with torch.no_grad():

_, latent_features = model(X_tensor)

# 將訓練好的特徵存回 AnnData 的 obsm 中，方便 Scanpy 繪圖

adata.obsm['X_ae'] = latent_features.numpy()

# 6. 可視化

sc.pl.scatter(adata, basis='ae', color='treatment', title="Autoencoder Latent Space")

import matplotlib.pyplot as plt

import pandas as pd

import numpy as np

import seaborn as sns

# 1. 提取編碼器第一層的權重 (Shape: 64 x 20899)

# model.encoder[0] 對應 nn.Linear(input_dim, 64)

weights = model.encoder[0].weight.detach().numpy()

# 2. 計算每個基因（column）的綜合影響力得分

# 我們取絕對值的總和，因為正向和負向權重都代表該基因具有資訊量

gene_importance = np.sum(np.abs(weights), axis=0)

# 3. 建立 DataFrame 並與基因名稱對應

importance_df = pd.DataFrame({

'gene_symbol': adata.var_names,

'importance_score': gene_importance

})

print("-^-" * 30)

# --- 1. 計算統計基準 ---

mean_score = gene_importance.mean()

std_score = gene_importance.std()

suggested_cutoff = mean_score + 2 * std_score

print(f"\n[統計資訊]")

print(f"平均得分 (Mean): {mean_score:.4f}")

print(f"標準差 (Std): {std_score:.4f}")

print(f"建議門檻值 (Mean + 2SD): {suggested_cutoff:.4f}")

print("-" * 30)

# --- 2. 暫停並等待輸入 ---

user_input = input(f"請輸入您想要的 Cutoff 門檻值 (直接按 Enter 則使用建議值 {suggested_cutoff:.4f}): ")

# 判斷輸入邏輯

try:

if user_input.strip() == "":

cutoff = suggested_cutoff

else:

cutoff = float(user_input)

except ValueError:

print("輸入格式錯誤，將自動使用建議門檻值。")

cutoff = suggested_cutoff

print(f"\n確認使用 Cutoff = {cutoff:.4f} 進行篩選...")

plt.figure(figsize=(8, 4))

sns.histplot(importance_df['importance_score'], bins=100, kde=True)

plt.axvline(cutoff, color='red', linestyle='--', label='Mean + 2SD')

plt.title("Distribution of Gene Importance Scores")

plt.legend()

plt.show()

# --- 3. 執行篩選 ---

# 標記是否顯著

importance_df['is_significant'] = importance_df['importance_score'] > cutoff

# 挑選出符合條件的基因並排序

top_genes_df = importance_df[importance_df['is_significant']].sort_values(

by='importance_score', ascending=False

)

# --- 4. 顯示結果 ---

num_sig = len(top_genes_df)

print(f"篩選完成！共找到 {num_sig} 個顯著基因。")

print('============================'*2)

print("--- Autoencoder 識別出的最具影響力基因 --- 基因數目＝", len(top_genes_df))

print(top_genes_df)

import matplotlib.pyplot as plt

# 5. 可視化：將這些重要基因的得分畫出來

import seaborn as sns

plt.figure(figsize=(10, 6))

sns.barplot(data=top_genes_df, x='importance_score', y='gene_symbol', palette='viridis')

plt.title(f"Influential Genes identified by AE, n = {num_sig}")

plt.xlabel("Cumulative Absolute Weights")

plt.show()

results_dict = {}

for i in range(1, 3): # 跑兩次

print(f"\n>>> 開始第 {i} 輪訓練...")

# 這裡放入你原本的代碼：定義模型、訓練、計算 importance...

# (為了簡潔，假設你已將 AE 訓練邏輯包裝成一個函數或直接放在這裡)

# 訓練與篩選邏輯結束後：

current_genes = set(importance_df[importance_df['importance_score'] > cutoff]['gene_symbol'])

results_dict[f'run_{i}'] = current_genes

print(f"第 {i} 輪完成，篩選出 {len(current_genes)} 個基因")

# 最後提取出來

genes_run1 = results_dict['run_1']

genes_run2 = results_dict['run_2']

# 假設第一次篩選出的基因清單為 genes_run1，第二次為 genes_run2

# 這裡根據你提供的 top_genes_df 邏輯提取基因名稱

# 1. 計算交集 (Intersection)

stable_genes = genes_run1.intersection(genes_run2)

# 2. 計算聯集 (Union)

all_detected_genes = genes_run1.union(genes_run2)

# 3. 計算重合率 (Jaccard Index)

jaccard_index = len(stable_genes) / len(all_detected_genes)

print("\n" + "="*40)

print("基因穩定性分析結果")

print("="*40)

print(f"第一次運行基因數: {len(genes_run1)}")

print(f"第二次運行基因數: {len(genes_run2)}")

print(f"真正穩定的基因數 (交集): {len(stable_genes)}")

print(f"重合率 (Jaccard Index): {jaccard_index:.2%}")

# 4. 顯示前 10 個最穩定的基因

print("\n前 10 個穩定基因範例:")

print(list(stable_genes)[:10])

# 5. 存檔：將穩定基因導出

pd.DataFrame({'stable_genes': list(stable_genes)}).to_csv("stable_genes_intersection.csv", index=False)

print("\n已將穩定基因清單存至: stable_genes_intersection.csv")

import pandas as pd

import gseapy as gp

import networkx as nx

import matplotlib.pyplot as plt

from pathlib import Path

# --- 1. 路徑設定 ---

desktop = Path.home() / "Desktop"

stable_csv = Path("stable_genes_intersection.csv") # AE 產出的檔案

excel_source = desktop / "Aja 21649_LeidCom_clustered.xlsx"

output_dir = desktop / "Autoncoder_Enrichment_Results"

output_dir.mkdir(exist_ok=True)

# --- 2. 數據讀取與映射 (Mapping) ---

print("🔍 正在進行基因映射...")

try:

# 讀取 AE 篩選出的穩定基因 (假設欄位名稱為 stable_genes)

ae_genes = pd.read_csv(stable_csv)['stable_genes'].tolist()

# 讀取原始含有註解的 Excel

df_ref = pd.read_excel(excel_source)

# 進行映射：在 Excel 中找尋 Transcriptid 存在於 ae_genes 清單中的行

# 注意：請確保兩者的名稱格式一致

mapped_df = df_ref[df_ref['Transcriptid'].isin(ae_genes)].copy()

# 提取 Gene_symbol 用於富集分析 (去除空值與重複)

gene_list = mapped_df['Gene_symbol'].replace('#N/A', pd.NA).dropna().unique().tolist()

print(f"✅ 映射完成！AE 篩選 {len(ae_genes)} 個 ID，成功對應到 {len(gene_list)} 個有效基因。")

except Exception as e:

print(f"❌ 讀取或映射失敗: {e}")

exit()

# --- 3. 基因富集分析 (Enrichr) ---

print("🚀 執行 KEGG 富集分析...")

try:

enr = gp.enrichr(gene_list=gene_list,

gene_sets=['KEGG_2021_Human'], # 或選取其他適合的物種資料庫

organism='Human',

cutoff=0.5)

res = enr.results

# 儲存完整富集結果

res.to_excel(output_dir / "AE_Stable_Genes_Enrichment.xlsx", index=False)

# 篩選出顯著的通路 (Adj P < 0.05) 作為繪圖對象

top_paths = res[res['Adjusted P-value'] < 0.1].head(10) # 取前 10 名

if top_paths.empty:

print("⚠️ 無顯著富集通路，調整門檻取前 5 名展示...")

top_paths = res.nsmallest(10, 'Adjusted P-value')

except Exception as e:

print(f"❌ 富集分析失敗: {e}")

exit()

# --- 4. 繪製 Cnetplot (Gene-Concept Network) ---

print("🎨 正在生成 Cnetplot...")

try:

G = nx.Graph()

pathway_label_map = {}

for _, row in top_paths.iterrows():

pathway = row['Term']

p_val = row['Adjusted P-value']

pathway_label_map[pathway] = f"{pathway}\n(p={p_val:.2e})"

# 獲取該通路中的基因並與我們的清單取交集

path_genes = str(row['Genes']).split(';')

valid_genes = [g for g in path_genes if g in gene_list]

# 加入通路節點

G.add_node(pathway, node_type='pathway', size=len(valid_genes))

# 加入基因節點與邊 (限制顯示數量避免畫面太亂)

for gene in valid_genes[:15]:

G.add_node(gene, node_type='gene')

G.add_edge(pathway, gene)

# 繪圖佈局

plt.figure(figsize=(15, 12))

pos = nx.spring_layout(G, k=0.5, iterations=50, seed=42)

# 分別繪製通路與基因

path_nodes = [n for n, a in G.nodes(data=True) if a.get('node_type') == 'pathway']

gene_nodes = [n for n, a in G.nodes(data=True) if a.get('node_type') == 'gene']

# 繪製邊

nx.draw_networkx_edges(G, pos, edge_color='#dfe6e9', alpha=0.5)

# 繪製 Pathway 節點 (紅圈)

sizes = [G.nodes[n]['size'] * 500 for n in path_nodes]

nx.draw_networkx_nodes(G, pos, nodelist=path_nodes, node_color='#ff7675',

node_size=sizes, edgecolors='black', label='Pathways')

# 繪製 Gene 節點 (藍圈)

nx.draw_networkx_nodes(G, pos, nodelist=gene_nodes, node_color='#74b9ff',

node_size=150, alpha=0.8, label='Genes')

# 加入標籤 (Pathway)

path_labels = {n: pathway_label_map[n] for n in path_nodes}

nx.draw_networkx_labels(G, pos, labels=path_labels, font_size=10, font_weight='bold')

# 加入標籤 (Gene)

gene_labels = {n: n for n in gene_nodes}

nx.draw_networkx_labels(G, pos, labels=gene_labels, font_size=8)

plt.title("Cnetplot: AE Stable Genes & Top KEGG Pathways", fontsize=15)

plt.axis('off')

# 存檔

plt.savefig(output_dir / "AE_Stable_Cnetplot.png", dpi=300, bbox_inches='tight')

plt.show()

print(f"✅ 分析完成！結果已存至: {output_dir}")

except Exception as e:

print(f"❌ 繪圖失敗: {e}")

import pandas as pd

import gseapy as gp

import networkx as nx

import matplotlib.pyplot as plt

import numpy as np

from pathlib import Path

import traceback

import sys # 修正 exit() 問題

# --- 1. 路徑與參數設定 ---

desktop = Path.home() / "Desktop"

stable_csv = Path("stable_genes_intersection.csv")

excel_source = desktop / "Aja 21649_LeidCom_clustered.xlsx"

output_dir = desktop / "AE_Stability_Network_Analysis"

output_dir.mkdir(exist_ok=True)

# 你的目標數值欄位

value_col = 'avg_diff'

# --- 2. 數據映射與預處理 ---

print("🔍 正在進行基因數據映射...")

try:

ae_ids = pd.read_csv(stable_csv)['stable_genes'].tolist()

df_ref = pd.read_excel(excel_source)

# --- 自動欄位檢查機制 ---

if value_col not in df_ref.columns:

print(f"⚠️ 找不到欄位 '{value_col}'。")

print(f"現有欄位清單: {df_ref.columns.tolist()}")

# 嘗試找尋備用欄位

alternatives = ['log2FoldChange', 'logFC', 'p_val_adj', 'cluster']

found_alt = next((alt for alt in alternatives if alt in df_ref.columns), None)

if found_alt:

value_col = found_alt

print(f"💡 自動切換至備用欄位: '{value_col}'")

else:

print("❗ 找不到任何數值欄位，將建立預設數值 (1.0) 以維持繪圖。")

df_ref['default_value'] = 1.0

value_col = 'default_value'

mapped_df = df_ref[df_ref['Transcriptid'].isin(ae_ids)].copy()

if mapped_df.empty:

print("❌ 映射後沒有任何基因！請檢查 Transcriptid 是否匹配。")

sys.exit()

# 建立對照字典

gene_value_map = mapped_df.groupby('Gene_symbol')[value_col].mean().to_dict()

gene_list = list(gene_value_map.keys())

print(f"✅ 映射完成：{len(gene_list)} 個獨特基因符號。")

except Exception as e:

print(f"❌ 映射出錯: {e}")

traceback.print_exc()

sys.exit() # 修正後的安全退出

# --- 3. 基因富集分析與長條圖繪製 ---

print("🚀 執行 KEGG 富集分析...")

try:

enr = gp.enrichr(gene_list=gene_list,

gene_sets=['KEGG_2021_Human'],

organism='Human',

cutoff=0.5)

res = enr.results

if res.empty or 'Adjusted P-value' not in res.columns:

print("⚠️ 富集分析沒有結果，可能基因清單太少或不匹配。")

else:

res['-log10_Adj_P'] = -np.log10(res['Adjusted P-value'].replace(0, 1e-10))

# 繪製 Pathway Bar Plot

# 修正：確保 n 是整數

n_top = 20

top_10 = res.nsmallest(n_top, 'Adjusted P-value').iloc[::-1]

plt.figure(figsize=(10, 8)) # 稍微增加高度以容納 20 條

colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(top_10)))

bars = plt.barh(top_10['Term'], top_10['-log10_Adj_P'], color=colors)

# 加入顯著性參考線 (P < 0.05 -> -log10(P) > 1.3)

# plt.axvline(x=-np.log10(0.1), color='red', linestyle='--', alpha=0.5, label='P=0.1')

plt.xlabel('-log10(Adjusted P-value)', fontsize=12)

plt.ylabel('Pathway Term', fontsize=12)

plt.title('Top Enriched KEGG Pathways (AE Stable Genes)', fontsize=14)

plt.legend()

plt.tight_layout()

plt.savefig(output_dir / "Pathway_Enrichment_Barplot.png", dpi=300)

plt.show()

# --- 4. 繪製純基因網路圖 (Network Graph) ---

print("🎨 生成穩定基因網路圖...")

G = nx.Graph()

top_paths = res.nsmallest(5, 'Adjusted P-value')

for _, row in top_paths.iterrows():

path_genes = [g for g in str(row['Genes']).split(';') if g in gene_list]

for i in range(len(path_genes)):

for j in range(i + 1, len(path_genes)):

G.add_edge(path_genes[i], path_genes[j])

if len(G.nodes) == 0:

print("⚠️ 基因間無共同通路聯繫，跳過網路圖製作。")

else:

plt.figure(figsize=(12, 12))

pos = nx.kamada_kawai_layout(G)

node_colors = [gene_value_map.get(n, 0) for n in G.nodes()]

nx.draw_networkx_edges(G, pos, alpha=0.2, edge_color='gray')

nodes = nx.draw_networkx_nodes(G, pos, node_size=500, node_color=node_colors,

cmap=plt.cm.RdBu_r, edgecolors='black', linewidths=0.5)

nx.draw_networkx_labels(G, pos, font_size=8, font_weight='bold')

plt.colorbar(nodes, label=f'Value ({value_col})', shrink=0.7)

plt.title("Gene Interaction Network")

plt.axis('off')

plt.savefig(output_dir / "Gene_Interaction_Network.png", dpi=300)

plt.show()

print(f"✨ 分析完成！結果已儲存至: {output_dir}")

except Exception as e:

traceback.print_exc()

L'anguille et le chat (鰻與貓)

Monday, January 05, 2026

Python coding of AUTOENCODER on Japanese eel ovarian transcriptome features-selection

No comments:

Blog Archive