2 månader sedan · 17e5c9360e
--- a/app.py
+++ b/app.py
@@ -18,10 +18,10 @@ from sklearn.cluster import DBSCAN
 
				 from sklearn.decomposition import PCA
			
 
				 from sklearn.preprocessing import StandardScaler
			
 
				 from defect_analysis.data_quality import build_data_quality_report
			
 
				-from defect_analysis.ml.key_factors import find_key_factors
			
 
				 from app_utils import (
			
 
				     apply_defect_filters,
			
 
				     build_diagnostic_dashboard,
			
 
				+    build_ml_factor_insights,
			
 
				     calculate_kpis,
			
 
				     calculate_spc_metrics,
			
 
				     generate_industry_diagnosis,
			
@@ -108,6 +108,17 @@ def load_data_from_csv():
 
				     df = pd.read_csv("defect_data.csv", parse_dates=["timestamp"])
			
 
				     return normalize_defect_schema(df)
			
 
				 
			
 
				+
			
 
				+@st.cache_data(ttl=300, show_spinner=False)
			
 
				+def build_cached_ml_factor_insights(data, target_defect_type, model_name, top_n):
			
 
				+    """缓存 ML 训练洞察，避免页面交互时重复训练。"""
			
 
				+    return build_ml_factor_insights(
			
 
				+        data,
			
 
				+        target_defect_type=target_defect_type,
			
 
				+        model_name=model_name,
			
 
				+        top_n=top_n,
			
 
				+    )
			
 
				+
			
 
				 if data_source == "上传CSV文件" and uploaded_df is not None:
			
 
				     df = uploaded_df
			
 
				 else:
			
@@ -605,11 +616,39 @@ _t = get_tab("🔬 ML 因子分析")
 
				 if _t:
			
 
				     with _t:
			
 
				         dashboard = build_diagnostic_dashboard(filtered_df)
			
 
				-        key_factors = find_key_factors(filtered_df, target_defect_type=dashboard["top_defect_type"], top_n=10)
			
 
				         extended_root_causes = dashboard.get("extended_root_causes")
			
 
				 
			
 
				         st.header("根因与关键因子分析")
			
 
				-        st.markdown("综合规则评分、统计分析与行业维度，输出可解释的异常候选。")
			
 
				+        st.markdown("综合规则评分、统计分析、机器学习验证与行业维度，输出可解释的异常候选。")
			
 
				+        ml_col1, ml_col2, ml_col3 = st.columns([1, 1, 1])
			
 
				+        with ml_col1:
			
 
				+            ml_target_type = st.selectbox(
			
 
				+                "目标缺陷",
			
 
				+                options=sorted(filtered_df["defect_type"].dropna().unique()),
			
 
				+                index=sorted(filtered_df["defect_type"].dropna().unique()).index(dashboard["top_defect_type"])
			
 
				+                if dashboard["top_defect_type"] in sorted(filtered_df["defect_type"].dropna().unique())
			
 
				+                else 0,
			
 
				+            )
			
 
				+        with ml_col2:
			
 
				+            ml_model_name = st.selectbox(
			
 
				+                "ML 模型",
			
 
				+                options=["random_forest", "logistic_regression", "xgboost", "lightgbm"],
			
 
				+                format_func=lambda name: {
			
 
				+                    "random_forest": "RandomForest",
			
 
				+                    "logistic_regression": "LogisticRegression",
			
 
				+                    "xgboost": "XGBoost",
			
 
				+                    "lightgbm": "LightGBM",
			
 
				+                }[name],
			
 
				+            )
			
 
				+        with ml_col3:
			
 
				+            ml_top_n = st.slider("候选因子数", min_value=5, max_value=30, value=10, step=5)
			
 
				+
			
 
				+        ml_insights = build_cached_ml_factor_insights(
			
 
				+            filtered_df,
			
 
				+            ml_target_type,
			
 
				+            ml_model_name,
			
 
				+            ml_top_n,
			
 
				+        )
			
 
				         st.divider()
			
 
				 
			
 
				         if extended_root_causes is not None and not extended_root_causes.empty:
			
@@ -621,15 +660,36 @@ if _t:
 
				             st.dataframe(extended_table, use_container_width=True, hide_index=True)
			
 
				             st.caption("覆盖治具、吸嘴、材料批次、清洗/绑定等维度，用于多前制程链路追溯。")
			
 
				 
			
 
				+        if ml_insights["error"]:
			
 
				+            st.warning(f"ML 模型暂不可用：{ml_insights['error']}")
			
 
				+        else:
			
 
				+            metric_train = ml_insights["metrics"]
			
 
				+            metric_valid = ml_insights["validation_metrics"]
			
 
				+            m1, m2, m3, m4 = st.columns(4)
			
 
				+            m1.metric("训练准确率", f"{metric_train.get('train_accuracy', 0):.1%}")
			
 
				+            m2.metric("训练 AUC", f"{metric_train.get('train_auc', 0):.3f}")
			
 
				+            m3.metric("验证准确率", f"{metric_valid.get('validation_accuracy', 0):.1%}")
			
 
				+            m4.metric("验证 AUC", f"{metric_valid.get('validation_auc', 0):.3f}")
			
 
				+
			
 
				+            importance_df = pd.DataFrame(ml_insights["feature_importance"])
			
 
				+            if not importance_df.empty:
			
 
				+                st.subheader("模型特征贡献 TOP")
			
 
				+                importance_df["importance"] = importance_df["importance"].map(lambda v: round(v, 4))
			
 
				+                st.dataframe(importance_df.head(15), use_container_width=True, hide_index=True)
			
 
				+                st.caption("用于判断模型主要依赖哪些设备、座号、材料批次、坐标或缺陷几何特征。")
			
 
				+
			
 
				+        key_factors = ml_insights["key_factors"]
			
 
				         if not key_factors.empty:
			
 
				-            st.subheader(f"关键因子分析：{dashboard['top_defect_type']}")
			
 
				+            st.subheader(f"关键因子分析：{ml_insights['target_defect_type']}")
			
 
				             key_factor_table = key_factors.copy()
			
 
				             key_factor_table["目标占比"] = key_factor_table["目标占比"].map(lambda v: f"{v:.1%}")
			
 
				             key_factor_table["基线占比"] = key_factor_table["基线占比"].map(lambda v: f"{v:.1%}")
			
 
				             key_factor_table["异常倍数"] = key_factor_table["异常倍数"].map(lambda v: f"{v:.2f}x")
			
 
				             key_factor_table["支持度"] = key_factor_table["支持度"].map(lambda v: f"{v:.1%}")
			
 
				+            if "ml_probability" in key_factor_table.columns:
			
 
				+                key_factor_table["ml_probability"] = key_factor_table["ml_probability"].map(lambda v: f"{v:.1%}")
			
 
				             st.dataframe(key_factor_table, use_container_width=True, hide_index=True)
			
 
				-            st.caption("关键因子按目标缺陷占比、异常倍数、样本数和支持度综合排序。")
			
 
				+            st.caption("关键因子按目标缺陷占比、异常倍数、样本数、支持度和模型概率综合排序。")
			
 
				         else:
			
 
				             st.info("当前数据未找到显著关键因子，可放宽筛选条件或增加样本量。")
			
 
				 
			
--- a/app_utils.py
+++ b/app_utils.py
@@ -3,6 +3,8 @@
 
				 import numpy as np
			
 
				 import pandas as pd
			
 
				 
			
 
				+from defect_analysis.ml.model_bundle import create_model_bundle
			
 
				+from defect_analysis.ml.predict import predict_key_factors
			
 
				 from defect_analysis.root_cause import EXTENDED_ROOT_CAUSE_DIMENSIONS, build_extended_root_causes
			
 
				 from defect_analysis.schemas import (
			
 
				     CORE_REQUIRED_COLUMNS,
			
@@ -322,3 +324,55 @@ def generate_industry_diagnosis(df, dashboard):
 
				         "patterns": patterns,
			
 
				         "recommendations": deduped[:5],
			
 
				     }
			
 
				+
			
 
				+
			
 
				+def build_ml_factor_insights(
			
 
				+    df,
			
 
				+    *,
			
 
				+    target_defect_type=None,
			
 
				+    target_severity=None,
			
 
				+    model_name="random_forest",
			
 
				+    top_n=10,
			
 
				+):
			
 
				+    """构建页面可展示的 ML 关键因子、验证指标和特征解释。"""
			
 
				+    normalized = normalize_defect_schema(df)
			
 
				+    resolved_target_type = target_defect_type
			
 
				+    if resolved_target_type is None and not normalized.empty:
			
 
				+        resolved_target_type = normalized["defect_type"].mode().iloc[0]
			
 
				+
			
 
				+    base = {
			
 
				+        "target_defect_type": resolved_target_type,
			
 
				+        "target_severity": target_severity,
			
 
				+        "model_name": model_name,
			
 
				+        "key_factors": pd.DataFrame(),
			
 
				+        "metrics": {},
			
 
				+        "validation_metrics": {},
			
 
				+        "feature_importance": [],
			
 
				+        "error": None,
			
 
				+    }
			
 
				+    if normalized.empty:
			
 
				+        base["error"] = "当前筛选条件下没有可训练数据。"
			
 
				+        return base
			
 
				+
			
 
				+    try:
			
 
				+        base["key_factors"] = predict_key_factors(
			
 
				+            normalized,
			
 
				+            target_defect_type=resolved_target_type,
			
 
				+            target_severity=target_severity,
			
 
				+            model_name=model_name,
			
 
				+            top_n=top_n,
			
 
				+        )
			
 
				+        bundle = create_model_bundle(
			
 
				+            normalized,
			
 
				+            model_name=model_name,
			
 
				+            target_defect_type=resolved_target_type,
			
 
				+            target_severity=target_severity,
			
 
				+        )
			
 
				+    except (RuntimeError, ValueError) as exc:
			
 
				+        base["error"] = str(exc)
			
 
				+        return base
			
 
				+
			
 
				+    base["metrics"] = bundle["metrics"]
			
 
				+    base["validation_metrics"] = bundle["validation_metrics"]
			
 
				+    base["feature_importance"] = bundle["feature_importance"]
			
 
				+    return base
			
--- a/tests/test_app_utils.py
+++ b/tests/test_app_utils.py
@@ -5,6 +5,7 @@ import pandas as pd
 
				 
			
 
				 from app_utils import (
			
 
				     apply_defect_filters,
			
 
				+    build_ml_factor_insights,
			
 
				     build_diagnostic_dashboard,
			
 
				     classify_panel_zone,
			
 
				     calculate_kpis,
			
@@ -208,6 +209,42 @@ class AppUtilsTest(unittest.TestCase):
 
				         self.assertEqual("FIX-HOT", extended.iloc[0]["候选值"])
			
 
				         self.assertGreater(extended.iloc[0]["异常倍数"], 1.0)
			
 
				 
			
 
    def test_ml_factor_insights_include_model_audit_outputs(self):
        """ML insights expose key factors, validation metrics and feature importance."""

        def make_row(index):
            # The first 24 rows form the "hot" group that carries the target
            # defect type together with its own equipment/fixture/lot values.
            is_hot = index < 24
            return {
                "defect_id": f"D{index}",
                "panel_id": f"P{index}",
                "batch_id": "B1",
                "equipment_id": "LAM-A01" if is_hot else "LAM-B01",
                "seat_id": "R1C1" if is_hot else "R2C2",
                "inspection_station": "AOI-1",
                "timestamp": pd.Timestamp("2026-04-01 08:00:00"),
                "defect_type": "气泡" if is_hot else "划痕",
                "severity": "严重" if index % 5 == 0 else "轻微",
                "x_mm": 10.0 + index,
                "y_mm": 20.0,
                "panel_width_mm": 155.0,
                "panel_height_mm": 340.0,
                "hour": 8,
                "shift": "白班",
                "day": "2026-04-01",
                "lam_fixture_id": "FIX-HOT" if is_hot else "FIX-OK",
                "material_lot_oca": "OCA-HOT" if is_hot else "OCA-OK",
            }

        records = [make_row(index) for index in range(40)]
        frame = normalize_defect_schema(pd.DataFrame(records))

        insights = build_ml_factor_insights(
            frame,
            target_defect_type="气泡",
            model_name="random_forest",
            top_n=5,
        )

        self.assertIsNone(insights["error"])
        self.assertEqual("气泡", insights["target_defect_type"])
        self.assertFalse(insights["key_factors"].empty)
        self.assertIn("validation_auc", insights["validation_metrics"])
        self.assertGreater(len(insights["feature_importance"]), 0)
			
 
				+
			
 
				 
			
 
				 if __name__ == "__main__":
			
 
				     unittest.main()