model_bundle.py 2.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091
  1. """可持久化的 ML 模型包。"""
  2. from datetime import datetime, timezone
  3. import joblib
  4. import pandas as pd
  5. from defect_analysis.ml.datasets import build_supervised_dataset
  6. from defect_analysis.ml.features import build_feature_frame
  7. from defect_analysis.ml.model_registry import detect_optional_model_backends
  8. from defect_analysis.ml.tabular_models import train_tabular_model
  9. from defect_analysis.schemas import normalize_defect_schema
# Format version stamped into every bundle by create_model_bundle and
# compared on load; load_model_bundle rejects any bundle whose version differs.
MODEL_BUNDLE_VERSION = 1
  11. def _target_config(target_defect_type=None, target_severity=None):
  12. return {
  13. "defect_type": target_defect_type,
  14. "severity": target_severity,
  15. "default": target_defect_type is None and target_severity is None,
  16. }
  17. def _align_features(features, feature_columns):
  18. """按训练时特征签名对齐新数据,避免 one-hot 列漂移导致推理失败。"""
  19. aligned = features.reindex(columns=feature_columns, fill_value=0.0)
  20. return aligned.astype(float)
  21. def create_model_bundle(
  22. df,
  23. *,
  24. model_name="random_forest",
  25. target_defect_type=None,
  26. target_severity=None,
  27. random_state=42,
  28. ):
  29. """训练并创建可保存的模型包。"""
  30. normalized = normalize_defect_schema(df)
  31. X, y = build_supervised_dataset(
  32. normalized,
  33. target_defect_type=target_defect_type,
  34. target_severity=target_severity,
  35. )
  36. if y.nunique() < 2:
  37. raise ValueError("目标标签只有一个类别,无法训练监督模型")
  38. trained = train_tabular_model(model_name, X, y, random_state=random_state)
  39. return {
  40. "bundle_version": MODEL_BUNDLE_VERSION,
  41. "created_at": datetime.now(timezone.utc).isoformat(),
  42. "model_name": model_name,
  43. "target": _target_config(target_defect_type, target_severity),
  44. "feature_columns": list(X.columns),
  45. "metrics": trained.get("metrics", {}),
  46. "optional_backends": detect_optional_model_backends(),
  47. "model": trained["model"],
  48. }
  49. def save_model_bundle(bundle, path):
  50. """保存模型包。"""
  51. joblib.dump(bundle, path)
  52. return path
  53. def load_model_bundle(path):
  54. """加载模型包。"""
  55. bundle = joblib.load(path)
  56. if bundle.get("bundle_version") != MODEL_BUNDLE_VERSION:
  57. raise ValueError("模型包版本不兼容")
  58. return bundle
  59. def predict_with_bundle(bundle, df):
  60. """使用模型包对新数据打分。"""
  61. normalized = normalize_defect_schema(df).reset_index(drop=True)
  62. features = build_feature_frame(normalized)
  63. X = _align_features(features, bundle["feature_columns"])
  64. model = bundle["model"]
  65. scored = normalized.copy()
  66. scored["ml_prediction"] = model.predict(X)
  67. if hasattr(model, "predict_proba"):
  68. scored["ml_probability"] = model.predict_proba(X)[:, 1]
  69. else:
  70. scored["ml_probability"] = pd.NA
  71. scored["model_name"] = bundle["model_name"]
  72. return scored