Ver código fonte

增强:支持模型包保存与批量打分

DESKTOP-74CLTRG\Leol 4 dias atrás
pai
commit
cfc81ff3dd
4 arquivos alterados com 181 adições e 6 exclusões
  1. 91 0
      defect_analysis/ml/model_bundle.py
  2. 1 0
      requirements.txt
  3. 48 0
      tests/test_ml_platform.py
  4. 41 6
      train_ml_models.py

+ 91 - 0
defect_analysis/ml/model_bundle.py

@@ -0,0 +1,91 @@
+"""可持久化的 ML 模型包。"""
+
+from datetime import datetime, timezone
+
+import joblib
+import pandas as pd
+
+from defect_analysis.ml.datasets import build_supervised_dataset
+from defect_analysis.ml.features import build_feature_frame
+from defect_analysis.ml.model_registry import detect_optional_model_backends
+from defect_analysis.ml.tabular_models import train_tabular_model
+from defect_analysis.schemas import normalize_defect_schema
+
+
+MODEL_BUNDLE_VERSION = 1
+
+
+def _target_config(target_defect_type=None, target_severity=None):
+    """Describe the training target as a plain dict for storage in a bundle.
+
+    The ``default`` entry is True only when neither a defect type nor a
+    severity was supplied.
+    """
+    return {
+        "defect_type": target_defect_type,
+        "severity": target_severity,
+        "default": target_defect_type is None and target_severity is None,
+    }
+
+
+def _align_features(features, feature_columns):
+    """Align new data to the training-time feature signature.
+
+    This keeps one-hot column drift from breaking inference.
+    """
+    # reindex keeps only the listed columns, in that order; columns absent
+    # from the new data are created and filled with 0.0.
+    aligned = features.reindex(columns=feature_columns, fill_value=0.0)
+    return aligned.astype(float)
+
+
+def create_model_bundle(
+    df,
+    *,
+    model_name="random_forest",
+    target_defect_type=None,
+    target_severity=None,
+    random_state=42,
+):
+    """Train a model and build a bundle that can be saved.
+
+    Args:
+        df: Raw defect data; it is passed through ``normalize_defect_schema``
+            before the supervised dataset is built.
+        model_name: Name forwarded to ``train_tabular_model``.
+        target_defect_type: Forwarded to ``build_supervised_dataset``.
+        target_severity: Forwarded to ``build_supervised_dataset``.
+        random_state: Seed forwarded to ``train_tabular_model``.
+
+    Returns:
+        A dict with the keys ``bundle_version``, ``created_at``,
+        ``model_name``, ``target``, ``feature_columns``, ``metrics``,
+        ``optional_backends`` and ``model``.
+
+    Raises:
+        ValueError: If the target labels contain fewer than two distinct
+            values.
+    """
+    normalized = normalize_defect_schema(df)
+    X, y = build_supervised_dataset(
+        normalized,
+        target_defect_type=target_defect_type,
+        target_severity=target_severity,
+    )
+    # A supervised model cannot be trained on a single-class target.
+    if y.nunique() < 2:
+        raise ValueError("目标标签只有一个类别,无法训练监督模型")
+
+    trained = train_tabular_model(model_name, X, y, random_state=random_state)
+    return {
+        "bundle_version": MODEL_BUNDLE_VERSION,
+        # Timezone-aware UTC timestamp in ISO 8601 form.
+        "created_at": datetime.now(timezone.utc).isoformat(),
+        "model_name": model_name,
+        "target": _target_config(target_defect_type, target_severity),
+        # Column order seen at training time; used later to align new data.
+        "feature_columns": list(X.columns),
+        # Falls back to an empty dict when the trainer reports no metrics.
+        "metrics": trained.get("metrics", {}),
+        "optional_backends": detect_optional_model_backends(),
+        "model": trained["model"],
+    }
+
+
+def save_model_bundle(bundle, path):
+    """Write the bundle to ``path`` with joblib and return ``path``."""
+    joblib.dump(bundle, path)
+    return path
+
+
+def load_model_bundle(path):
+    """加载模型包。"""
+    bundle = joblib.load(path)
+    if bundle.get("bundle_version") != MODEL_BUNDLE_VERSION:
+        raise ValueError("模型包版本不兼容")
+    return bundle
+
+
+def predict_with_bundle(bundle, df):
+    """Score new data with a model bundle.
+
+    Args:
+        bundle: Bundle dict; ``feature_columns``, ``model`` and
+            ``model_name`` are read from it.
+        df: New defect data to score.
+
+    Returns:
+        A copy of the normalized data with ``ml_prediction``,
+        ``ml_probability`` and ``model_name`` columns added.
+    """
+    # Index is reset to 0..n-1 before features are built; presumably so the
+    # positional model outputs line up with the rows -- TODO confirm.
+    normalized = normalize_defect_schema(df).reset_index(drop=True)
+    features = build_feature_frame(normalized)
+    X = _align_features(features, bundle["feature_columns"])
+    model = bundle["model"]
+
+    scored = normalized.copy()
+    scored["ml_prediction"] = model.predict(X)
+    if hasattr(model, "predict_proba"):
+        # Takes the second probability column; assumes a binary target whose
+        # positive class is at index 1 -- TODO confirm against the trainer.
+        scored["ml_probability"] = model.predict_proba(X)[:, 1]
+    else:
+        # Models without predict_proba get a missing-value placeholder.
+        scored["ml_probability"] = pd.NA
+    scored["model_name"] = bundle["model_name"]
+    return scored

+ 1 - 0
requirements.txt

@@ -4,4 +4,5 @@ numpy
 matplotlib
 seaborn
 scikit-learn
+joblib
 plotly

+ 48 - 0
tests/test_ml_platform.py

@@ -5,6 +5,12 @@ import pandas as pd
 from defect_analysis.ml.datasets import build_supervised_dataset
 from defect_analysis.ml.features import build_feature_frame
 from defect_analysis.ml.image_models import ImageModelUnavailable, ImageModelWrapper
+from defect_analysis.ml.model_bundle import (
+    create_model_bundle,
+    load_model_bundle,
+    predict_with_bundle,
+    save_model_bundle,
+)
 from defect_analysis.ml.model_registry import detect_optional_model_backends
 from defect_analysis.ml.predict import predict_key_factors
 from defect_analysis.ml.tabular_models import train_tabular_model
@@ -91,6 +97,48 @@ class MLPlatformTest(unittest.TestCase):
         with self.assertRaises(ImageModelUnavailable):
             wrapper.predict([])
 
+    def test_model_bundle_can_be_saved_loaded_and_score_new_data(self):
+        """Round-trip a bundle through disk and score the last five rows."""
+        bundle = create_model_bundle(
+            self.df,
+            model_name="random_forest",
+            target_defect_type="气泡",
+        )
+
+        self.assertEqual("random_forest", bundle["model_name"])
+        self.assertEqual("气泡", bundle["target"]["defect_type"])
+        self.assertGreater(len(bundle["feature_columns"]), 0)
+        self.assertIn("metrics", bundle)
+
+        path = "tmp_test_model_bundle.joblib"
+        try:
+            save_model_bundle(bundle, path)
+            loaded = load_model_bundle(path)
+            scored = predict_with_bundle(loaded, self.df.tail(5))
+        finally:
+            # Remove the temporary file even when save/load/scoring fails.
+            import os
+
+            if os.path.exists(path):
+                os.remove(path)
+
+        self.assertEqual(5, len(scored))
+        self.assertIn("ml_probability", scored.columns)
+        self.assertTrue(scored["ml_probability"].between(0, 1).all())
+
+    def test_model_bundle_aligns_missing_feature_columns_for_new_data(self):
+        """Scoring still returns every row when two columns hold new values."""
+        bundle = create_model_bundle(
+            self.df,
+            model_name="logistic_regression",
+            target_defect_type="气泡",
+        )
+        new_df = self.df.tail(3).copy()
+        # Presumably these are categorical values unseen in training, so their
+        # one-hot columns are missing from the bundle -- confirm against
+        # build_feature_frame.
+        new_df["equipment_id"] = "NEW-LAM"
+        new_df["seat_id"] = "NEW-SEAT"
+
+        scored = predict_with_bundle(bundle, new_df)
+
+        self.assertEqual(3, len(scored))
+        self.assertIn("ml_prediction", scored.columns)
+
 
 if __name__ == "__main__":
     unittest.main()

+ 41 - 6
train_ml_models.py

@@ -6,6 +6,12 @@ import pandas as pd
 
 from defect_analysis.ml.datasets import build_supervised_dataset
 from defect_analysis.ml.features import build_feature_frame
+from defect_analysis.ml.model_bundle import (
+    create_model_bundle,
+    load_model_bundle,
+    predict_with_bundle,
+    save_model_bundle,
+)
 from defect_analysis.ml.model_registry import detect_optional_model_backends
 from defect_analysis.ml.predict import predict_key_factors
 from defect_analysis.ml.tabular_models import train_tabular_model
@@ -28,11 +34,29 @@ def main():
     parser.add_argument("--target-severity")
     parser.add_argument("--top-n", type=int, default=10)
     parser.add_argument("--show-backends", action="store_true")
+    parser.add_argument("--save-model", help="训练后保存监督模型包到指定路径,仅支持监督模型")
+    parser.add_argument("--model-path", help="批量打分时加载的模型包路径")
+    parser.add_argument("--predict-csv", help="使用已保存模型包对新 CSV 批量打分")
+    parser.add_argument("--output-csv", help="批量打分结果导出路径,默认打印前 20 行")
     args = parser.parse_args()
 
     if args.show_backends:
         print(detect_optional_model_backends())
 
+    if args.predict_csv:
+        model_path = args.model_path or args.save_model
+        if not model_path:
+            raise SystemExit("--predict-csv 需要通过 --model-path 指定已保存的模型包路径")
+        bundle = load_model_bundle(model_path)
+        scored = predict_with_bundle(bundle, load_defect_csv(args.predict_csv))
+        if args.output_csv:
+            scored.to_csv(args.output_csv, index=False, encoding="utf-8-sig")
+            print(f"批量打分完成: {args.output_csv},样本数={len(scored)}")
+        else:
+            columns = ["defect_id", "panel_id", "defect_type", "severity", "ml_prediction", "ml_probability", "model_name"]
+            print(scored[[col for col in columns if col in scored.columns]].head(20).to_string(index=False))
+        return
+
     df = load_defect_csv(args.csv)
     if args.model == "isolation_forest":
         X = build_feature_frame(df)
@@ -41,12 +65,23 @@ def main():
         print(f"IsolationForest 完成: 样本数={len(scores)}, 最高异常分={scores.max():.4f}, 平均异常分={scores.mean():.4f}")
         return
 
-    X, y = build_supervised_dataset(
-        df,
-        target_defect_type=args.target_defect_type,
-        target_severity=args.target_severity,
-    )
-    result = train_tabular_model(args.model, X, y)
+    if args.save_model:
+        bundle = create_model_bundle(
+            df,
+            model_name=args.model,
+            target_defect_type=args.target_defect_type,
+            target_severity=args.target_severity,
+        )
+        save_model_bundle(bundle, args.save_model)
+        result = {"metrics": bundle["metrics"]}
+        print(f"模型包已保存: {args.save_model}")
+    else:
+        X, y = build_supervised_dataset(
+            df,
+            target_defect_type=args.target_defect_type,
+            target_severity=args.target_severity,
+        )
+        result = train_tabular_model(args.model, X, y)
     print(f"{args.model} 训练完成: {result['metrics']}")
 
     predictions = predict_key_factors(