Monday, January 05, 2026

Python coding of AUTOENCODER on Japanese eel ovarian transcriptome features-selection

import scanpy as sc

import anndata as ad

import numpy as np0

import torch

import torch.nn as nn

import torch.optim as optim





# 1. 讀取原始檔案

AData = sc.read_h5ad("6ovaries_analysis_results.h5ad")

adata = sc.read_h5ad("6ovaries_analysis_results.h5ad")


# 2. 檢查目前結構

# 如果讀進來已經是 6 x 20899,就不要執行 .T

# 如果讀進來是 20899 x 6,才執行一次 .T 把它轉正

if adata.n_obs > adata.n_vars:

    adata = adata



print(f"數據維度: {adata.n_obs} 樣本, {adata.n_vars} 基因")

print("目前的 obs 欄位:", adata.obs.columns)


# 標準化與對數轉換 (深度學習對數值範圍敏感)

sc.pp.log1p(adata)

sc.pp.scale(adata, max_value=10) # 建議增加 scale 步驟


# 轉換為 Tensor

X = adata.X.toarray() if hasattr(adata.X, "toarray") else adata.X

X_tensor = torch.FloatTensor(X)


# 2. 定義自編碼器模型

class Autoencoder(nn.Module):

    def __init__(self, input_dim, latent_dim=2):

        super(Autoencoder, self).__init__()

        # 編碼器:降低維度

        self.encoder = nn.Sequential(

            nn.Linear(input_dim, 128),

            nn.ReLU(),

            nn.Linear(128, latent_dim),

        )

        # 解碼器:重建原始維度

        self.decoder = nn.Sequential(

            nn.Linear(latent_dim, 128),

            nn.ReLU(),

            nn.Linear(128, input_dim),

        )


    def forward(self, x):

        latent = self.encoder(x)

        reconstructed = self.decoder(latent)

        return reconstructed, latent


# 3. 初始化模型、損失函數與優化器

input_dim = X.shape[1]

latent_dim = 1000 # 只有 6 個樣本,建議將隱藏層設小 (如 2),方便可視化

model = Autoencoder(input_dim, latent_dim)

criterion = nn.MSELoss()

optimizer = optim.Adam(model.parameters(), lr=1e-3)


# 4. 訓練模型

epochs = 1000

model.train()

for epoch in range(epochs):

    # 前向傳播

    reconstructed, latent = model(X_tensor)

    loss = criterion(reconstructed, X_tensor)

    

    # 反向傳播與優化

    optimizer.zero_grad()

    loss.backward()

    optimizer.step()

    

    if (epoch + 1) % 20 == 0:

        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")


# 5. 提取隱藏層特徵 (Latent Space)

model.eval()

with torch.no_grad():

    _, latent_features = model(X_tensor)

    # 將訓練好的特徵存回 AnnData 的 obsm 中,方便 Scanpy 繪圖

    adata.obsm['X_ae'] = latent_features.numpy()


# 6. 可視化

sc.pl.scatter(adata, basis='ae', color='treatment', title="Autoencoder Latent Space")




import matplotlib.pyplot as plt

import pandas as pd

import numpy as np

import seaborn as sns



# 1. 提取編碼器第一層的權重 (Shape: 64 x 20899)

# model.encoder[0] 對應 nn.Linear(input_dim, 64)

weights = model.encoder[0].weight.detach().numpy()


# 2. 計算每個基因(column)的綜合影響力得分

# 我們取絕對值的總和,因為正向和負向權重都代表該基因具有資訊量

gene_importance = np.sum(np.abs(weights), axis=0)


# 3. 建立 DataFrame 並與基因名稱對應

importance_df = pd.DataFrame({

    'gene_symbol': adata.var_names,

    'importance_score': gene_importance

})


print("-^-" * 30)

# --- 1. 計算統計基準 ---

mean_score = gene_importance.mean()

std_score = gene_importance.std()

suggested_cutoff = mean_score + 2 * std_score


print(f"\n[統計資訊]")

print(f"平均得分 (Mean): {mean_score:.4f}")

print(f"標準差 (Std): {std_score:.4f}")

print(f"建議門檻值 (Mean + 2SD): {suggested_cutoff:.4f}")

print("-" * 30)



# --- 2. 暫停並等待輸入 ---

user_input = input(f"請輸入您想要的 Cutoff 門檻值 (直接按 Enter 則使用建議值 {suggested_cutoff:.4f}): ")


# 判斷輸入邏輯

try:

    if user_input.strip() == "":

        cutoff = suggested_cutoff

    else:

        cutoff = float(user_input)

except ValueError:

    print("輸入格式錯誤,將自動使用建議門檻值。")

    cutoff = suggested_cutoff


print(f"\n確認使用 Cutoff = {cutoff:.4f} 進行篩選...")


plt.figure(figsize=(8, 4))

sns.histplot(importance_df['importance_score'], bins=100, kde=True)

plt.axvline(cutoff, color='red', linestyle='--', label='Mean + 2SD')

plt.title("Distribution of Gene Importance Scores")

plt.legend()

plt.show()


# --- 3. 執行篩選 ---

# 標記是否顯著

importance_df['is_significant'] = importance_df['importance_score'] > cutoff


# 挑選出符合條件的基因並排序

top_genes_df = importance_df[importance_df['is_significant']].sort_values(

    by='importance_score', ascending=False

)


# --- 4. 顯示結果 ---

num_sig = len(top_genes_df)

print(f"篩選完成!共找到 {num_sig} 個顯著基因。")

print('============================'*2)





print("--- Autoencoder 識別出的最具影響力基因 ---  基因數目 =", len(top_genes_df))

print(top_genes_df)


import matplotlib.pyplot as plt

# 5. 可視化:將這些重要基因的得分畫出來

import seaborn as sns

plt.figure(figsize=(10, 6))

sns.barplot(data=top_genes_df, x='importance_score', y='gene_symbol', palette='viridis')

plt.title(f"Influential Genes identified by AE, n = {num_sig}")

plt.xlabel("Cumulative Absolute Weights")

plt.show()


results_dict = {}


for i in range(1, 3): # 跑兩次

    print(f"\n>>> 開始第 {i} 輪訓練...")

    

    # 這裡放入你原本的代碼:定義模型、訓練、計算 importance...

    # (為了簡潔,假設你已將 AE 訓練邏輯包裝成一個函數或直接放在這裡)

    

    # 訓練與篩選邏輯結束後:

    current_genes = set(importance_df[importance_df['importance_score'] > cutoff]['gene_symbol'])

    results_dict[f'run_{i}'] = current_genes

    print(f"第 {i} 輪完成,篩選出 {len(current_genes)} 個基因")


# 最後提取出來

genes_run1 = results_dict['run_1']

genes_run2 = results_dict['run_2']


# 假設第一次篩選出的基因清單為 genes_run1,第二次為 genes_run2

# 這裡根據你提供的 top_genes_df 邏輯提取基因名稱



# 1. 計算交集 (Intersection)

stable_genes = genes_run1.intersection(genes_run2)


# 2. 計算聯集 (Union)

all_detected_genes = genes_run1.union(genes_run2)


# 3. 計算重合率 (Jaccard Index)

jaccard_index = len(stable_genes) / len(all_detected_genes)


print("\n" + "="*40)

print("基因穩定性分析結果")

print("="*40)

print(f"第一次運行基因數: {len(genes_run1)}")

print(f"第二次運行基因數: {len(genes_run2)}")

print(f"真正穩定的基因數 (交集): {len(stable_genes)}")

print(f"重合率 (Jaccard Index): {jaccard_index:.2%}")


# 4. 顯示前 10 個最穩定的基因

print("\n前 10 個穩定基因範例:")

print(list(stable_genes)[:10])


# 5. 存檔:將穩定基因導出

pd.DataFrame({'stable_genes': list(stable_genes)}).to_csv("stable_genes_intersection.csv", index=False)

print("\n已將穩定基因清單存至: stable_genes_intersection.csv")



import pandas as pd

import gseapy as gp

import networkx as nx

import matplotlib.pyplot as plt

from pathlib import Path


# --- 1. 路徑設定 ---

desktop = Path.home() / "Desktop"

stable_csv = Path("stable_genes_intersection.csv") # AE 產出的檔案

excel_source = desktop / "Aja 21649_LeidCom_clustered.xlsx"

output_dir = desktop / "Autoncoder_Enrichment_Results"

output_dir.mkdir(exist_ok=True)


# --- 2. 數據讀取與映射 (Mapping) ---

print("🔍 正在進行基因映射...")

try:

    # 讀取 AE 篩選出的穩定基因 (假設欄位名稱為 stable_genes)

    ae_genes = pd.read_csv(stable_csv)['stable_genes'].tolist()

    

    # 讀取原始含有註解的 Excel

    df_ref = pd.read_excel(excel_source)

    

    # 進行映射:在 Excel 中找尋 Transcriptid 存在於 ae_genes 清單中的行

    # 注意:請確保兩者的名稱格式一致

    mapped_df = df_ref[df_ref['Transcriptid'].isin(ae_genes)].copy()

    

    # 提取 Gene_symbol 用於富集分析 (去除空值與重複)

    gene_list = mapped_df['Gene_symbol'].replace('#N/A', pd.NA).dropna().unique().tolist()

    

    print(f"✅ 映射完成!AE 篩選 {len(ae_genes)} 個 ID,成功對應到 {len(gene_list)} 個有效基因。")

except Exception as e:

    print(f"❌ 讀取或映射失敗: {e}")

    exit()


# --- 3. 基因富集分析 (Enrichr) ---

print("🚀 執行 KEGG 富集分析...")

try:

    enr = gp.enrichr(gene_list=gene_list,

                     gene_sets=['KEGG_2021_Human'], # 或選取其他適合的物種資料庫

                     organism='Human', 

                     cutoff=0.5)

    res = enr.results

    

    # 儲存完整富集結果

    res.to_excel(output_dir / "AE_Stable_Genes_Enrichment.xlsx", index=False)

    

    # 篩選出顯著的通路 (Adj P < 0.05) 作為繪圖對象

    top_paths = res[res['Adjusted P-value'] < 0.1].head(10) # 取前 10 名

    if top_paths.empty:

        print("⚠️ 無顯著富集通路,調整門檻取前 5 名展示...")

        top_paths = res.nsmallest(10, 'Adjusted P-value')


except Exception as e:

    print(f"❌ 富集分析失敗: {e}")

    exit()


# --- 4. 繪製 Cnetplot (Gene-Concept Network) ---

print("🎨 正在生成 Cnetplot...")

try:

    G = nx.Graph()

    pathway_label_map = {}


    for _, row in top_paths.iterrows():

        pathway = row['Term']

        p_val = row['Adjusted P-value']

        pathway_label_map[pathway] = f"{pathway}\n(p={p_val:.2e})"

        

        # 獲取該通路中的基因並與我們的清單取交集

        path_genes = str(row['Genes']).split(';')

        valid_genes = [g for g in path_genes if g in gene_list]

        

        # 加入通路節點

        G.add_node(pathway, node_type='pathway', size=len(valid_genes))

        

        # 加入基因節點與邊 (限制顯示數量避免畫面太亂)

        for gene in valid_genes[:15]: 

            G.add_node(gene, node_type='gene')

            G.add_edge(pathway, gene)


    # 繪圖佈局

    plt.figure(figsize=(15, 12))

    pos = nx.spring_layout(G, k=0.5, iterations=50, seed=42)

    

    # 分別繪製通路與基因

    path_nodes = [n for n, a in G.nodes(data=True) if a.get('node_type') == 'pathway']

    gene_nodes = [n for n, a in G.nodes(data=True) if a.get('node_type') == 'gene']


    # 繪製邊

    nx.draw_networkx_edges(G, pos, edge_color='#dfe6e9', alpha=0.5)

    

    # 繪製 Pathway 節點 (紅圈)

    sizes = [G.nodes[n]['size'] * 500 for n in path_nodes]

    nx.draw_networkx_nodes(G, pos, nodelist=path_nodes, node_color='#ff7675', 

                           node_size=sizes, edgecolors='black', label='Pathways')

    

    # 繪製 Gene 節點 (藍圈)

    nx.draw_networkx_nodes(G, pos, nodelist=gene_nodes, node_color='#74b9ff', 

                           node_size=150, alpha=0.8, label='Genes')


    # 加入標籤 (Pathway)

    path_labels = {n: pathway_label_map[n] for n in path_nodes}

    nx.draw_networkx_labels(G, pos, labels=path_labels, font_size=10, font_weight='bold')

    

    # 加入標籤 (Gene)

    gene_labels = {n: n for n in gene_nodes}

    nx.draw_networkx_labels(G, pos, labels=gene_labels, font_size=8)


    plt.title("Cnetplot: AE Stable Genes & Top KEGG Pathways", fontsize=15)

    plt.axis('off')

    

    # 存檔

    plt.savefig(output_dir / "AE_Stable_Cnetplot.png", dpi=300, bbox_inches='tight')

    plt.show()

    print(f"✅ 分析完成!結果已存至: {output_dir}")


except Exception as e:

    print(f"❌ 繪圖失敗: {e}")

    

import pandas as pd

import gseapy as gp

import networkx as nx

import matplotlib.pyplot as plt

import numpy as np

from pathlib import Path

import traceback

import sys  # 修正 exit() 問題


# --- 1. 路徑與參數設定 ---

desktop = Path.home() / "Desktop"

stable_csv = Path("stable_genes_intersection.csv")

excel_source = desktop / "Aja 21649_LeidCom_clustered.xlsx"

output_dir = desktop / "AE_Stability_Network_Analysis"

output_dir.mkdir(exist_ok=True)


# 你的目標數值欄位

value_col = 'avg_diff' 


# --- 2. 數據映射與預處理 ---

print("🔍 正在進行基因數據映射...")

try:

    ae_ids = pd.read_csv(stable_csv)['stable_genes'].tolist()

    df_ref = pd.read_excel(excel_source)

    

    # --- 自動欄位檢查機制 ---

    if value_col not in df_ref.columns:

        print(f"⚠️ 找不到欄位 '{value_col}'。")

        print(f"現有欄位清單: {df_ref.columns.tolist()}")

        

        # 嘗試找尋備用欄位

        alternatives = ['log2FoldChange', 'logFC', 'p_val_adj', 'cluster']

        found_alt = next((alt for alt in alternatives if alt in df_ref.columns), None)

        

        if found_alt:

            value_col = found_alt

            print(f"💡 自動切換至備用欄位: '{value_col}'")

        else:

            print("❗ 找不到任何數值欄位,將建立預設數值 (1.0) 以維持繪圖。")

            df_ref['default_value'] = 1.0

            value_col = 'default_value'


    mapped_df = df_ref[df_ref['Transcriptid'].isin(ae_ids)].copy()

    

    if mapped_df.empty:

        print("❌ 映射後沒有任何基因!請檢查 Transcriptid 是否匹配。")

        sys.exit()


    # 建立對照字典

    gene_value_map = mapped_df.groupby('Gene_symbol')[value_col].mean().to_dict()

    gene_list = list(gene_value_map.keys())

    

    print(f"✅ 映射完成:{len(gene_list)} 個獨特基因符號。")


except Exception as e:

    print(f"❌ 映射出錯: {e}")

    traceback.print_exc()

    sys.exit() # 修正後的安全退出


# --- 3. 基因富集分析與長條圖繪製 ---

print("🚀 執行 KEGG 富集分析...")

try:

    enr = gp.enrichr(gene_list=gene_list,

                     gene_sets=['KEGG_2021_Human'],

                     organism='Human', 

                     cutoff=0.5)

    res = enr.results

    

    if res.empty or 'Adjusted P-value' not in res.columns:

        print("⚠️ 富集分析沒有結果,可能基因清單太少或不匹配。")

    else:

        res['-log10_Adj_P'] = -np.log10(res['Adjusted P-value'].replace(0, 1e-10))

        

# 繪製 Pathway Bar Plot

        # 修正:確保 n 是整數

        n_top = 20 

        top_10 = res.nsmallest(n_top, 'Adjusted P-value').iloc[::-1]

        

        plt.figure(figsize=(10, 8)) # 稍微增加高度以容納 20 條

        colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(top_10)))

        

        bars = plt.barh(top_10['Term'], top_10['-log10_Adj_P'], color=colors)

        

        # 加入顯著性參考線 (P < 0.05 -> -log10(P) > 1.3)

#        plt.axvline(x=-np.log10(0.1), color='red', linestyle='--', alpha=0.5, label='P=0.1')

        

        plt.xlabel('-log10(Adjusted P-value)', fontsize=12)

        plt.ylabel('Pathway Term', fontsize=12)

        plt.title('Top Enriched KEGG Pathways (AE Stable Genes)', fontsize=14)

        plt.legend()

        plt.tight_layout()

        

        plt.savefig(output_dir / "Pathway_Enrichment_Barplot.png", dpi=300)

        plt.show()


# --- 4. 繪製純基因網路圖 (Network Graph) ---

    print("🎨 生成穩定基因網路圖...")

    G = nx.Graph()

    top_paths = res.nsmallest(5, 'Adjusted P-value')

    

    for _, row in top_paths.iterrows():

        path_genes = [g for g in str(row['Genes']).split(';') if g in gene_list]

        for i in range(len(path_genes)):

            for j in range(i + 1, len(path_genes)):

                G.add_edge(path_genes[i], path_genes[j])


    if len(G.nodes) == 0:

        print("⚠️ 基因間無共同通路聯繫,跳過網路圖製作。")

    else:

        plt.figure(figsize=(12, 12))

        pos = nx.kamada_kawai_layout(G)

        node_colors = [gene_value_map.get(n, 0) for n in G.nodes()]

        

        nx.draw_networkx_edges(G, pos, alpha=0.2, edge_color='gray')

        nodes = nx.draw_networkx_nodes(G, pos, node_size=500, node_color=node_colors, 

                                       cmap=plt.cm.RdBu_r, edgecolors='black', linewidths=0.5)

        nx.draw_networkx_labels(G, pos, font_size=8, font_weight='bold')

        

        plt.colorbar(nodes, label=f'Value ({value_col})', shrink=0.7)

        plt.title("Gene Interaction Network")

        plt.axis('off')

        plt.savefig(output_dir / "Gene_Interaction_Network.png", dpi=300)

        plt.show()


    print(f"✨ 分析完成!結果已儲存至: {output_dir}")


except Exception as e:

    traceback.print_exc()

No comments: