2 месяцев назад · 17e5c9360e
--- a/app.py
+++ b/app.py
@@ -18,10 +18,10 @@ from sklearn.cluster import DBSCAN
 
															 from sklearn.decomposition import PCA
														
 
															 from sklearn.preprocessing import StandardScaler
														
 
															 from defect_analysis.data_quality import build_data_quality_report
														
 
															-from defect_analysis.ml.key_factors import find_key_factors
														
 
															 from app_utils import (
														
 
															     apply_defect_filters,
														
 
															     build_diagnostic_dashboard,
														
 
															+    build_ml_factor_insights,
														
 
															     calculate_kpis,
														
 
															     calculate_spc_metrics,
														
 
															     generate_industry_diagnosis,
														
@@ -108,6 +108,17 @@ def load_data_from_csv():
 
															     df = pd.read_csv("defect_data.csv", parse_dates=["timestamp"])
														
 
															     return normalize_defect_schema(df)
														
 
															+
														
 
															+@st.cache_data(ttl=300, show_spinner=False)
														
 
															+def build_cached_ml_factor_insights(data, target_defect_type, model_name, top_n):
														
 
															+    """缓存 ML 训练洞察，避免页面交互时重复训练。"""
														
 
															+    return build_ml_factor_insights(
														
 
															+        data,
														
 
															+        target_defect_type=target_defect_type,
														
 
															+        model_name=model_name,
														
 
															+        top_n=top_n,
														
 
															+    )
														
 
															+
														
 
															 if data_source == "上传CSV文件" and uploaded_df is not None:
														
 
															     df = uploaded_df
														
 
															 else:
														
@@ -605,11 +616,39 @@ _t = get_tab("🔬 ML 因子分析")
 
															 if _t:
														
 
															     with _t:
														
 
															         dashboard = build_diagnostic_dashboard(filtered_df)
														
 
															-        key_factors = find_key_factors(filtered_df, target_defect_type=dashboard["top_defect_type"], top_n=10)
														
 
															         extended_root_causes = dashboard.get("extended_root_causes")
														
 
															         st.header("根因与关键因子分析")
														
 
															-        st.markdown("综合规则评分、统计分析与行业维度，输出可解释的异常候选。")
														
 
															+        st.markdown("综合规则评分、统计分析、机器学习验证与行业维度，输出可解释的异常候选。")
														
 
															+        ml_col1, ml_col2, ml_col3 = st.columns([1, 1, 1])
														
 
															+        with ml_col1:
														
 
															+            ml_target_type = st.selectbox(
														
 
															+                "目标缺陷",
														
 
															+                options=sorted(filtered_df["defect_type"].dropna().unique()),
														
 
															+                index=sorted(filtered_df["defect_type"].dropna().unique()).index(dashboard["top_defect_type"])
														
 
															+                if dashboard["top_defect_type"] in sorted(filtered_df["defect_type"].dropna().unique())
														
 
															+                else 0,
														
 
															+            )
														
 
															+        with ml_col2:
														
 
															+            ml_model_name = st.selectbox(
														
 
															+                "ML 模型",
														
 
															+                options=["random_forest", "logistic_regression", "xgboost", "lightgbm"],
														
 
															+                format_func=lambda name: {
														
 
															+                    "random_forest": "RandomForest",
														
 
															+                    "logistic_regression": "LogisticRegression",
														
 
															+                    "xgboost": "XGBoost",
														
 
															+                    "lightgbm": "LightGBM",
														
 
															+                }[name],
														
 
															+            )
														
 
															+        with ml_col3:
														
 
															+            ml_top_n = st.slider("候选因子数", min_value=5, max_value=30, value=10, step=5)
														
 
															+
														
 
															+        ml_insights = build_cached_ml_factor_insights(
														
 
															+            filtered_df,
														
 
															+            ml_target_type,
														
 
															+            ml_model_name,
														
 
															+            ml_top_n,
														
 
															+        )
														
 
															         st.divider()
														
 
															         if extended_root_causes is not None and not extended_root_causes.empty:
														
@@ -621,15 +660,36 @@ if _t:
 
															             st.dataframe(extended_table, use_container_width=True, hide_index=True)
														
 
															             st.caption("覆盖治具、吸嘴、材料批次、清洗/绑定等维度，用于多前制程链路追溯。")
														
 
															+        if ml_insights["error"]:
														
 
															+            st.warning(f"ML 模型暂不可用：{ml_insights['error']}")
														
 
															+        else:
														
 
															+            metric_train = ml_insights["metrics"]
														
 
															+            metric_valid = ml_insights["validation_metrics"]
														
 
															+            m1, m2, m3, m4 = st.columns(4)
														
 
															+            m1.metric("训练准确率", f"{metric_train.get('train_accuracy', 0):.1%}")
														
 
															+            m2.metric("训练 AUC", f"{metric_train.get('train_auc', 0):.3f}")
														
 
															+            m3.metric("验证准确率", f"{metric_valid.get('validation_accuracy', 0):.1%}")
														
 
															+            m4.metric("验证 AUC", f"{metric_valid.get('validation_auc', 0):.3f}")
														
 
															+
														
 
															+            importance_df = pd.DataFrame(ml_insights["feature_importance"])
														
 
															+            if not importance_df.empty:
														
 
															+                st.subheader("模型特征贡献 TOP")
														
 
															+                importance_df["importance"] = importance_df["importance"].map(lambda v: round(v, 4))
														
 
															+                st.dataframe(importance_df.head(15), use_container_width=True, hide_index=True)
														
 
															+                st.caption("用于判断模型主要依赖哪些设备、座号、材料批次、坐标或缺陷几何特征。")
														
 
															+
														
 
															+        key_factors = ml_insights["key_factors"]
														
 
															         if not key_factors.empty:
														
 
															-            st.subheader(f"关键因子分析：{dashboard['top_defect_type']}")
														
 
															+            st.subheader(f"关键因子分析：{ml_insights['target_defect_type']}")
														
 
															             key_factor_table = key_factors.copy()
														
 
															             key_factor_table["目标占比"] = key_factor_table["目标占比"].map(lambda v: f"{v:.1%}")
														
 
															             key_factor_table["基线占比"] = key_factor_table["基线占比"].map(lambda v: f"{v:.1%}")
														
 
															             key_factor_table["异常倍数"] = key_factor_table["异常倍数"].map(lambda v: f"{v:.2f}x")
														
 
															             key_factor_table["支持度"] = key_factor_table["支持度"].map(lambda v: f"{v:.1%}")
														
 
															+            if "ml_probability" in key_factor_table.columns:
														
 
															+                key_factor_table["ml_probability"] = key_factor_table["ml_probability"].map(lambda v: f"{v:.1%}")
														
 
															             st.dataframe(key_factor_table, use_container_width=True, hide_index=True)
														
 
															-            st.caption("关键因子按目标缺陷占比、异常倍数、样本数和支持度综合排序。")
														
 
															+            st.caption("关键因子按目标缺陷占比、异常倍数、样本数、支持度和模型概率综合排序。")
														
 
															         else:
														
 
															             st.info("当前数据未找到显著关键因子，可放宽筛选条件或增加样本量。")
														
--- a/app_utils.py
+++ b/app_utils.py
@@ -3,6 +3,8 @@
 
															 import numpy as np
														
 
															 import pandas as pd
														
 
															+from defect_analysis.ml.model_bundle import create_model_bundle
														
 
															+from defect_analysis.ml.predict import predict_key_factors
														
 
															 from defect_analysis.root_cause import EXTENDED_ROOT_CAUSE_DIMENSIONS, build_extended_root_causes
														
 
															 from defect_analysis.schemas import (
														
 
															     CORE_REQUIRED_COLUMNS,
														
@@ -322,3 +324,55 @@ def generate_industry_diagnosis(df, dashboard):
 
															         "patterns": patterns,
														
 
															         "recommendations": deduped[:5],
														
 
															     }
														
 
															+
														
 
															+
														
 
															+def build_ml_factor_insights(
														
 
															+    df,
														
 
															+    *,
														
 
															+    target_defect_type=None,
														
 
															+    target_severity=None,
														
 
															+    model_name="random_forest",
														
 
															+    top_n=10,
														
 
															+):
														
 
															+    """构建页面可展示的 ML 关键因子、验证指标和特征解释。"""
														
 
															+    normalized = normalize_defect_schema(df)
														
 
															+    resolved_target_type = target_defect_type
														
 
															+    if resolved_target_type is None and not normalized.empty:
														
 
															+        resolved_target_type = normalized["defect_type"].mode().iloc[0]
														
 
															+
														
 
															+    base = {
														
 
															+        "target_defect_type": resolved_target_type,
														
 
															+        "target_severity": target_severity,
														
 
															+        "model_name": model_name,
														
 
															+        "key_factors": pd.DataFrame(),
														
 
															+        "metrics": {},
														
 
															+        "validation_metrics": {},
														
 
															+        "feature_importance": [],
														
 
															+        "error": None,
														
 
															+    }
														
 
															+    if normalized.empty:
														
 
															+        base["error"] = "当前筛选条件下没有可训练数据。"
														
 
															+        return base
														
 
															+
														
 
															+    try:
														
 
															+        base["key_factors"] = predict_key_factors(
														
 
															+            normalized,
														
 
															+            target_defect_type=resolved_target_type,
														
 
															+            target_severity=target_severity,
														
 
															+            model_name=model_name,
														
 
															+            top_n=top_n,
														
 
															+        )
														
 
															+        bundle = create_model_bundle(
														
 
															+            normalized,
														
 
															+            model_name=model_name,
														
 
															+            target_defect_type=resolved_target_type,
														
 
															+            target_severity=target_severity,
														
 
															+        )
														
 
															+    except (RuntimeError, ValueError) as exc:
														
 
															+        base["error"] = str(exc)
														
 
															+        return base
														
 
															+
														
 
															+    base["metrics"] = bundle["metrics"]
														
 
															+    base["validation_metrics"] = bundle["validation_metrics"]
														
 
															+    base["feature_importance"] = bundle["feature_importance"]
														
 
															+    return base
														
--- a/tests/test_app_utils.py
+++ b/tests/test_app_utils.py
@@ -5,6 +5,7 @@ import pandas as pd
 
															 from app_utils import (
														
 
															     apply_defect_filters,
														
 
															+    build_ml_factor_insights,
														
 
															     build_diagnostic_dashboard,
														
 
															     classify_panel_zone,
														
 
															     calculate_kpis,
														
@@ -208,6 +209,42 @@ class AppUtilsTest(unittest.TestCase):
 
															         self.assertEqual("FIX-HOT", extended.iloc[0]["候选值"])
														
 
															         self.assertGreater(extended.iloc[0]["异常倍数"], 1.0)
														
 
															+    def test_ml_factor_insights_include_model_audit_outputs(self):
														
 
															+        rows = []
														
 
															+        for i in range(40):
														
 
															+            hot = i < 24
														
 
															+            rows.append(
														
 
															+                {
														
 
															+                    "defect_id": f"D{i}",
														
 
															+                    "panel_id": f"P{i}",
														
 
															+                    "batch_id": "B1",
														
 
															+                    "equipment_id": "LAM-A01" if hot else "LAM-B01",
														
 
															+                    "seat_id": "R1C1" if hot else "R2C2",
														
 
															+                    "inspection_station": "AOI-1",
														
 
															+                    "timestamp": pd.Timestamp("2026-04-01 08:00:00"),
														
 
															+                    "defect_type": "气泡" if hot else "划痕",
														
 
															+                    "severity": "严重" if i % 5 == 0 else "轻微",
														
 
															+                    "x_mm": 10.0 + i,
														
 
															+                    "y_mm": 20.0,
														
 
															+                    "panel_width_mm": 155.0,
														
 
															+                    "panel_height_mm": 340.0,
														
 
															+                    "hour": 8,
														
 
															+                    "shift": "白班",
														
 
															+                    "day": "2026-04-01",
														
 
															+                    "lam_fixture_id": "FIX-HOT" if hot else "FIX-OK",
														
 
															+                    "material_lot_oca": "OCA-HOT" if hot else "OCA-OK",
														
 
															+                }
														
 
															+            )
														
 
															+        df = normalize_defect_schema(pd.DataFrame(rows))
														
 
															+
														
 
															+        insights = build_ml_factor_insights(df, target_defect_type="气泡", model_name="random_forest", top_n=5)
														
 
															+
														
 
															+        self.assertIsNone(insights["error"])
														
 
															+        self.assertEqual("气泡", insights["target_defect_type"])
														
 
															+        self.assertFalse(insights["key_factors"].empty)
														
 
															+        self.assertIn("validation_auc", insights["validation_metrics"])
														
 
															+        self.assertGreater(len(insights["feature_importance"]), 0)
														
 
															+
														
 
															 if __name__ == "__main__":
														
 
															     unittest.main()