Bläddra i källkod

增强:接入 ML 因子分析页面评估展示

DESKTOP-74CLTRG\Leol 4 dagar sedan
förälder
incheckning
17e5c9360e
3 ändrade filer med 156 tillägg och 5 borttagningar
  1. 65 5
      app.py
  2. 54 0
      app_utils.py
  3. 37 0
      tests/test_app_utils.py

+ 65 - 5
app.py

@@ -18,10 +18,10 @@ from sklearn.cluster import DBSCAN
 from sklearn.decomposition import PCA
 from sklearn.preprocessing import StandardScaler
 from defect_analysis.data_quality import build_data_quality_report
-from defect_analysis.ml.key_factors import find_key_factors
 from app_utils import (
     apply_defect_filters,
     build_diagnostic_dashboard,
+    build_ml_factor_insights,
     calculate_kpis,
     calculate_spc_metrics,
     generate_industry_diagnosis,
@@ -108,6 +108,17 @@ def load_data_from_csv():
     df = pd.read_csv("defect_data.csv", parse_dates=["timestamp"])
     return normalize_defect_schema(df)
 
+
+@st.cache_data(ttl=300, show_spinner=False)  # ttl is in seconds per Streamlit's cache_data; no spinner is shown
+def build_cached_ml_factor_insights(data, target_defect_type, model_name, top_n):
+    """Cache the ML training insights so page interactions do not retrain the model."""
+    return build_ml_factor_insights(  # thin wrapper: positional UI arguments are forwarded as keywords
+        data,
+        target_defect_type=target_defect_type,
+        model_name=model_name,
+        top_n=top_n,
+    )
+
 if data_source == "上传CSV文件" and uploaded_df is not None:
     df = uploaded_df
 else:
@@ -605,11 +616,39 @@ _t = get_tab("🔬 ML 因子分析")
 if _t:
     with _t:
         dashboard = build_diagnostic_dashboard(filtered_df)
-        key_factors = find_key_factors(filtered_df, target_defect_type=dashboard["top_defect_type"], top_n=10)
         extended_root_causes = dashboard.get("extended_root_causes")
 
         st.header("根因与关键因子分析")
-        st.markdown("综合规则评分、统计分析与行业维度,输出可解释的异常候选。")
+        st.markdown("综合规则评分、统计分析、机器学习验证与行业维度,输出可解释的异常候选。")
+        ml_col1, ml_col2, ml_col3 = st.columns([1, 1, 1])
+        with ml_col1:
+            ml_target_type = st.selectbox(
+                "目标缺陷",
+                options=sorted(filtered_df["defect_type"].dropna().unique()),
+                index=sorted(filtered_df["defect_type"].dropna().unique()).index(dashboard["top_defect_type"])
+                if dashboard["top_defect_type"] in sorted(filtered_df["defect_type"].dropna().unique())
+                else 0,
+            )
+        with ml_col2:
+            ml_model_name = st.selectbox(
+                "ML 模型",
+                options=["random_forest", "logistic_regression", "xgboost", "lightgbm"],
+                format_func=lambda name: {
+                    "random_forest": "RandomForest",
+                    "logistic_regression": "LogisticRegression",
+                    "xgboost": "XGBoost",
+                    "lightgbm": "LightGBM",
+                }[name],
+            )
+        with ml_col3:
+            ml_top_n = st.slider("候选因子数", min_value=5, max_value=30, value=10, step=5)
+
+        ml_insights = build_cached_ml_factor_insights(
+            filtered_df,
+            ml_target_type,
+            ml_model_name,
+            ml_top_n,
+        )
         st.divider()
 
         if extended_root_causes is not None and not extended_root_causes.empty:
@@ -621,15 +660,36 @@ if _t:
             st.dataframe(extended_table, use_container_width=True, hide_index=True)
             st.caption("覆盖治具、吸嘴、材料批次、清洗/绑定等维度,用于多前制程链路追溯。")
 
+        if ml_insights["error"]:
+            st.warning(f"ML 模型暂不可用:{ml_insights['error']}")
+        else:
+            metric_train = ml_insights["metrics"]
+            metric_valid = ml_insights["validation_metrics"]
+            m1, m2, m3, m4 = st.columns(4)
+            m1.metric("训练准确率", f"{metric_train.get('train_accuracy', 0):.1%}")
+            m2.metric("训练 AUC", f"{metric_train.get('train_auc', 0):.3f}")
+            m3.metric("验证准确率", f"{metric_valid.get('validation_accuracy', 0):.1%}")
+            m4.metric("验证 AUC", f"{metric_valid.get('validation_auc', 0):.3f}")
+
+            importance_df = pd.DataFrame(ml_insights["feature_importance"])
+            if not importance_df.empty:
+                st.subheader("模型特征贡献 TOP")
+                importance_df["importance"] = importance_df["importance"].map(lambda v: round(v, 4))
+                st.dataframe(importance_df.head(15), use_container_width=True, hide_index=True)
+                st.caption("用于判断模型主要依赖哪些设备、座号、材料批次、坐标或缺陷几何特征。")
+
+        key_factors = ml_insights["key_factors"]
         if not key_factors.empty:
-            st.subheader(f"关键因子分析:{dashboard['top_defect_type']}")
+            st.subheader(f"关键因子分析:{ml_insights['target_defect_type']}")
             key_factor_table = key_factors.copy()
             key_factor_table["目标占比"] = key_factor_table["目标占比"].map(lambda v: f"{v:.1%}")
             key_factor_table["基线占比"] = key_factor_table["基线占比"].map(lambda v: f"{v:.1%}")
             key_factor_table["异常倍数"] = key_factor_table["异常倍数"].map(lambda v: f"{v:.2f}x")
             key_factor_table["支持度"] = key_factor_table["支持度"].map(lambda v: f"{v:.1%}")
+            if "ml_probability" in key_factor_table.columns:
+                key_factor_table["ml_probability"] = key_factor_table["ml_probability"].map(lambda v: f"{v:.1%}")
             st.dataframe(key_factor_table, use_container_width=True, hide_index=True)
-            st.caption("关键因子按目标缺陷占比、异常倍数、样本数和支持度综合排序。")
+            st.caption("关键因子按目标缺陷占比、异常倍数、样本数、支持度和模型概率综合排序。")
         else:
             st.info("当前数据未找到显著关键因子,可放宽筛选条件或增加样本量。")
 

+ 54 - 0
app_utils.py

@@ -3,6 +3,8 @@
 import numpy as np
 import pandas as pd
 
+from defect_analysis.ml.model_bundle import create_model_bundle
+from defect_analysis.ml.predict import predict_key_factors
 from defect_analysis.root_cause import EXTENDED_ROOT_CAUSE_DIMENSIONS, build_extended_root_causes
 from defect_analysis.schemas import (
     CORE_REQUIRED_COLUMNS,
@@ -322,3 +324,55 @@ def generate_industry_diagnosis(df, dashboard):
         "patterns": patterns,
         "recommendations": deduped[:5],
     }
+
+
+def build_ml_factor_insights(
+    df,
+    *,  # everything after df is keyword-only
+    target_defect_type=None,
+    target_severity=None,
+    model_name="random_forest",
+    top_n=10,
+):
+    """Build the ML key factors, validation metrics and feature explanations shown on the page."""
+    normalized = normalize_defect_schema(df)
+    resolved_target_type = target_defect_type
+    if resolved_target_type is None and not normalized.empty:  # no explicit target given
+        resolved_target_type = normalized["defect_type"].mode().iloc[0]  # first mode of defect_type
+
+    base = {  # result skeleton; returned as-is (plus "error") on every failure path
+        "target_defect_type": resolved_target_type,
+        "target_severity": target_severity,
+        "model_name": model_name,
+        "key_factors": pd.DataFrame(),
+        "metrics": {},
+        "validation_metrics": {},
+        "feature_importance": [],
+        "error": None,  # stays None on success
+    }
+    if normalized.empty:  # nothing to train on: report it through "error" instead of raising
+        base["error"] = "当前筛选条件下没有可训练数据。"
+        return base
+
+    try:
+        base["key_factors"] = predict_key_factors(  # stored before the bundle call, so kept if that call raises
+            normalized,
+            target_defect_type=resolved_target_type,
+            target_severity=target_severity,
+            model_name=model_name,
+            top_n=top_n,
+        )
+        bundle = create_model_bundle(  # supplies the metrics and feature importances read below
+            normalized,
+            model_name=model_name,
+            target_defect_type=resolved_target_type,
+            target_severity=target_severity,
+        )
+    except (RuntimeError, ValueError) as exc:  # only these two types become an error message; others propagate
+        base["error"] = str(exc)
+        return base
+
+    base["metrics"] = bundle["metrics"]
+    base["validation_metrics"] = bundle["validation_metrics"]
+    base["feature_importance"] = bundle["feature_importance"]
+    return base

+ 37 - 0
tests/test_app_utils.py

@@ -5,6 +5,7 @@ import pandas as pd
 
 from app_utils import (
     apply_defect_filters,
+    build_ml_factor_insights,
     build_diagnostic_dashboard,
     classify_panel_zone,
     calculate_kpis,
@@ -208,6 +209,42 @@ class AppUtilsTest(unittest.TestCase):
         self.assertEqual("FIX-HOT", extended.iloc[0]["候选值"])
         self.assertGreater(extended.iloc[0]["异常倍数"], 1.0)
 
+    def test_ml_factor_insights_include_model_audit_outputs(self):  # success path: factors, validation AUC, importances
+        rows = []
+        for i in range(40):  # 40 synthetic defect rows
+            hot = i < 24  # rows 0-23 form the "hot" group; the other 16 are the contrast group
+            rows.append(
+                {
+                    "defect_id": f"D{i}",
+                    "panel_id": f"P{i}",
+                    "batch_id": "B1",
+                    "equipment_id": "LAM-A01" if hot else "LAM-B01",
+                    "seat_id": "R1C1" if hot else "R2C2",
+                    "inspection_station": "AOI-1",
+                    "timestamp": pd.Timestamp("2026-04-01 08:00:00"),
+                    "defect_type": "气泡" if hot else "划痕",  # hot rows carry the target defect type used below
+                    "severity": "严重" if i % 5 == 0 else "轻微",  # every fifth row gets the other severity value
+                    "x_mm": 10.0 + i,
+                    "y_mm": 20.0,
+                    "panel_width_mm": 155.0,
+                    "panel_height_mm": 340.0,
+                    "hour": 8,
+                    "shift": "白班",
+                    "day": "2026-04-01",
+                    "lam_fixture_id": "FIX-HOT" if hot else "FIX-OK",  # fixture and OCA lot vary together with the hot flag
+                    "material_lot_oca": "OCA-HOT" if hot else "OCA-OK",
+                }
+            )
+        df = normalize_defect_schema(pd.DataFrame(rows))
+
+        insights = build_ml_factor_insights(df, target_defect_type="气泡", model_name="random_forest", top_n=5)
+
+        self.assertIsNone(insights["error"])  # no error message was recorded
+        self.assertEqual("气泡", insights["target_defect_type"])  # the explicit target is echoed back
+        self.assertFalse(insights["key_factors"].empty)
+        self.assertIn("validation_auc", insights["validation_metrics"])
+        self.assertGreater(len(insights["feature_importance"]), 0)
+
 
 if __name__ == "__main__":
     unittest.main()