소스 검색

增强:增加模型验证评估与特征解释

DESKTOP-74CLTRG\Leol 4일 전
부모
커밋
7c076fa2c4
4개의 파일 변경, 85줄 추가, 8줄 삭제
  1. + 17 - 1
      defect_analysis/ml/model_bundle.py
  2. + 27 - 7
      defect_analysis/ml/tabular_models.py
  3. + 20 - 0
      tests/test_ml_platform.py
  4. + 21 - 0
      train_ml_models.py

+ 17 - 1
defect_analysis/ml/model_bundle.py

@@ -4,11 +4,12 @@ from datetime import datetime, timezone
 
 import joblib
 import pandas as pd
+from sklearn.model_selection import train_test_split
 
 from defect_analysis.ml.datasets import build_supervised_dataset
 from defect_analysis.ml.features import build_feature_frame
 from defect_analysis.ml.model_registry import detect_optional_model_backends
-from defect_analysis.ml.tabular_models import train_tabular_model
+from defect_analysis.ml.tabular_models import classification_metrics, extract_feature_importance, train_tabular_model
 from defect_analysis.schemas import normalize_defect_schema
 
 
@@ -36,6 +37,7 @@ def create_model_bundle(
     target_defect_type=None,
     target_severity=None,
     random_state=42,
+    test_size=0.25,
 ):
     """训练并创建可保存的模型包。"""
     normalized = normalize_defect_schema(df)
@@ -47,7 +49,19 @@ def create_model_bundle(
     if y.nunique() < 2:
         raise ValueError("目标标签只有一个类别,无法训练监督模型")
 
+    stratify = y if y.value_counts().min() >= 2 else None
+    X_train, X_valid, y_train, y_valid = train_test_split(
+        X,
+        y,
+        test_size=test_size,
+        random_state=random_state,
+        stratify=stratify,
+    )
+    validation_model = train_tabular_model(model_name, X_train, y_train, random_state=random_state)["model"]
+    validation_metrics = classification_metrics(validation_model, X_valid, y_valid, prefix="validation")
+
     trained = train_tabular_model(model_name, X, y, random_state=random_state)
+    feature_importance = extract_feature_importance(trained["model"], X.columns)
     return {
         "bundle_version": MODEL_BUNDLE_VERSION,
         "created_at": datetime.now(timezone.utc).isoformat(),
@@ -55,6 +69,8 @@ def create_model_bundle(
         "target": _target_config(target_defect_type, target_severity),
         "feature_columns": list(X.columns),
         "metrics": trained.get("metrics", {}),
+        "validation_metrics": validation_metrics,
+        "feature_importance": feature_importance,
         "optional_backends": detect_optional_model_backends(),
         "model": trained["model"],
     }

+ 27 - 7
defect_analysis/ml/tabular_models.py

@@ -7,14 +7,34 @@ from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import StandardScaler
 
 
-def _classification_metrics(model, X, y):
+def classification_metrics(model, X, y, *, prefix="train"):
+    """计算二分类评估指标。"""
     pred = model.predict(X)
-    metrics = {"train_accuracy": float(accuracy_score(y, pred))}
+    metrics = {f"{prefix}_accuracy": float(accuracy_score(y, pred))}
     if hasattr(model, "predict_proba") and len(set(y)) > 1:
-        metrics["train_auc"] = float(roc_auc_score(y, model.predict_proba(X)[:, 1]))
+        metrics[f"{prefix}_auc"] = float(roc_auc_score(y, model.predict_proba(X)[:, 1]))
     return metrics
 
 
+def extract_feature_importance(model, feature_columns, *, top_n=20):
+    """提取模型特征贡献,用于工程复盘和模型审计。"""
+    if hasattr(model, "feature_importances_"):
+        values = model.feature_importances_
+    elif hasattr(model, "named_steps") and "logisticregression" in model.named_steps:
+        values = abs(model.named_steps["logisticregression"].coef_[0])
+    elif hasattr(model, "coef_"):
+        values = abs(model.coef_[0])
+    else:
+        return []
+
+    rows = [
+        {"feature": feature, "importance": float(importance)}
+        for feature, importance in zip(feature_columns, values)
+    ]
+    rows.sort(key=lambda item: item["importance"], reverse=True)
+    return rows[:top_n]
+
+
 def train_tabular_model(model_name, X, y=None, *, random_state=42):
     """训练表格模型。
 
@@ -31,7 +51,7 @@ def train_tabular_model(model_name, X, y=None, *, random_state=42):
             class_weight="balanced",
         )
         model.fit(X, y)
-        return {"model_name": model_name, "model": model, "metrics": _classification_metrics(model, X, y)}
+        return {"model_name": model_name, "model": model, "metrics": classification_metrics(model, X, y)}
 
     if model_name == "logistic_regression":
         if y is None:
@@ -41,7 +61,7 @@ def train_tabular_model(model_name, X, y=None, *, random_state=42):
             LogisticRegression(max_iter=3000, class_weight="balanced", solver="liblinear"),
         )
         model.fit(X, y)
-        return {"model_name": model_name, "model": model, "metrics": _classification_metrics(model, X, y)}
+        return {"model_name": model_name, "model": model, "metrics": classification_metrics(model, X, y)}
 
     if model_name == "isolation_forest":
         model = IsolationForest(n_estimators=100, contamination="auto", random_state=random_state)
@@ -64,7 +84,7 @@ def train_tabular_model(model_name, X, y=None, *, random_state=42):
             random_state=random_state,
         )
         model.fit(X, y)
-        return {"model_name": model_name, "model": model, "metrics": _classification_metrics(model, X, y)}
+        return {"model_name": model_name, "model": model, "metrics": classification_metrics(model, X, y)}
 
     if model_name == "lightgbm":
         if y is None:
@@ -81,6 +101,6 @@ def train_tabular_model(model_name, X, y=None, *, random_state=42):
             verbose=-1,
         )
         model.fit(X, y)
-        return {"model_name": model_name, "model": model, "metrics": _classification_metrics(model, X, y)}
+        return {"model_name": model_name, "model": model, "metrics": classification_metrics(model, X, y)}
 
     raise ValueError(f"不支持的模型: {model_name}")

+ 20 - 0
tests/test_ml_platform.py

@@ -15,6 +15,7 @@ from defect_analysis.ml.model_registry import detect_optional_model_backends
 from defect_analysis.ml.predict import predict_key_factors
 from defect_analysis.ml.tabular_models import train_tabular_model
 from defect_analysis.schemas import normalize_defect_schema
+from train_ml_models import build_bundle_report
 
 
 class MLPlatformTest(unittest.TestCase):
@@ -108,6 +109,11 @@ class MLPlatformTest(unittest.TestCase):
         self.assertEqual("气泡", bundle["target"]["defect_type"])
         self.assertGreater(len(bundle["feature_columns"]), 0)
         self.assertIn("metrics", bundle)
+        self.assertIn("validation_metrics", bundle)
+        self.assertIn("feature_importance", bundle)
+        self.assertGreater(len(bundle["feature_importance"]), 0)
+        self.assertIn("feature", bundle["feature_importance"][0])
+        self.assertIn("importance", bundle["feature_importance"][0])
 
         path = "tmp_test_model_bundle.joblib"
         try:
@@ -139,6 +145,20 @@ class MLPlatformTest(unittest.TestCase):
         self.assertEqual(3, len(scored))
         self.assertIn("ml_prediction", scored.columns)
 
+    def test_bundle_report_excludes_model_object_and_keeps_audit_fields(self):
+        bundle = create_model_bundle(
+            self.df,
+            model_name="random_forest",
+            target_defect_type="气泡",
+        )
+
+        report = build_bundle_report(bundle)
+
+        self.assertNotIn("model", report)
+        self.assertIn("validation_metrics", report)
+        self.assertIn("feature_importance", report)
+        self.assertGreater(report["feature_count"], 0)
+
 
 if __name__ == "__main__":
     unittest.main()

+ 21 - 0
train_ml_models.py

@@ -1,6 +1,7 @@
 """训练和验证结构化 ML 模型。"""
 
 import argparse
+import json
 
 import pandas as pd
 
@@ -22,6 +23,21 @@ def load_defect_csv(csv_path):
     return normalize_defect_schema(pd.read_csv(csv_path, parse_dates=["timestamp"], encoding="utf-8-sig"))
 
 
+def build_bundle_report(bundle):
+    """生成可序列化的模型训练报告。"""
+    return {
+        "bundle_version": bundle["bundle_version"],
+        "created_at": bundle["created_at"],
+        "model_name": bundle["model_name"],
+        "target": bundle["target"],
+        "feature_count": len(bundle["feature_columns"]),
+        "metrics": bundle["metrics"],
+        "validation_metrics": bundle["validation_metrics"],
+        "feature_importance": bundle["feature_importance"],
+        "optional_backends": bundle["optional_backends"],
+    }
+
+
 def main():
     parser = argparse.ArgumentParser(description="训练/运行不良分析 ML 模型")
     parser.add_argument("--csv", default="defect_data.csv")
@@ -38,6 +54,7 @@ def main():
     parser.add_argument("--model-path", help="批量打分时加载的模型包路径")
     parser.add_argument("--predict-csv", help="使用已保存模型包对新 CSV 批量打分")
     parser.add_argument("--output-csv", help="批量打分结果导出路径,默认打印前 20 行")
+    parser.add_argument("--report-json", help="导出训练评估报告 JSON")
     args = parser.parse_args()
 
     if args.show_backends:
@@ -75,6 +92,10 @@ def main():
         save_model_bundle(bundle, args.save_model)
         result = {"metrics": bundle["metrics"]}
         print(f"模型包已保存: {args.save_model}")
+        if args.report_json:
+            with open(args.report_json, "w", encoding="utf-8") as f:
+                json.dump(build_bundle_report(bundle), f, ensure_ascii=False, indent=2)
+            print(f"训练评估报告已保存: {args.report_json}")
     else:
         X, y = build_supervised_dataset(
             df,