train_ml_models.py 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102
"""Train and validate structured ML models."""
  2. import argparse
  3. import pandas as pd
  4. from defect_analysis.ml.datasets import build_supervised_dataset
  5. from defect_analysis.ml.features import build_feature_frame
  6. from defect_analysis.ml.model_bundle import (
  7. create_model_bundle,
  8. load_model_bundle,
  9. predict_with_bundle,
  10. save_model_bundle,
  11. )
  12. from defect_analysis.ml.model_registry import detect_optional_model_backends
  13. from defect_analysis.ml.predict import predict_key_factors
  14. from defect_analysis.ml.tabular_models import train_tabular_model
  15. from defect_analysis.schemas import normalize_defect_schema
  16. def load_defect_csv(csv_path):
  17. return normalize_defect_schema(pd.read_csv(csv_path, parse_dates=["timestamp"], encoding="utf-8-sig"))
  18. def main():
  19. parser = argparse.ArgumentParser(description="训练/运行不良分析 ML 模型")
  20. parser.add_argument("--csv", default="defect_data.csv")
  21. parser.add_argument(
  22. "--model",
  23. default="random_forest",
  24. choices=["random_forest", "logistic_regression", "isolation_forest", "xgboost", "lightgbm"],
  25. )
  26. parser.add_argument("--target-defect-type")
  27. parser.add_argument("--target-severity")
  28. parser.add_argument("--top-n", type=int, default=10)
  29. parser.add_argument("--show-backends", action="store_true")
  30. parser.add_argument("--save-model", help="训练后保存监督模型包到指定路径,仅支持监督模型")
  31. parser.add_argument("--model-path", help="批量打分时加载的模型包路径")
  32. parser.add_argument("--predict-csv", help="使用已保存模型包对新 CSV 批量打分")
  33. parser.add_argument("--output-csv", help="批量打分结果导出路径,默认打印前 20 行")
  34. args = parser.parse_args()
  35. if args.show_backends:
  36. print(detect_optional_model_backends())
  37. if args.predict_csv:
  38. model_path = args.model_path or args.save_model
  39. if not model_path:
  40. raise SystemExit("--predict-csv 需要通过 --model-path 指定已保存的模型包路径")
  41. bundle = load_model_bundle(model_path)
  42. scored = predict_with_bundle(bundle, load_defect_csv(args.predict_csv))
  43. if args.output_csv:
  44. scored.to_csv(args.output_csv, index=False, encoding="utf-8-sig")
  45. print(f"批量打分完成: {args.output_csv},样本数={len(scored)}")
  46. else:
  47. columns = ["defect_id", "panel_id", "defect_type", "severity", "ml_prediction", "ml_probability", "model_name"]
  48. print(scored[[col for col in columns if col in scored.columns]].head(20).to_string(index=False))
  49. return
  50. df = load_defect_csv(args.csv)
  51. if args.model == "isolation_forest":
  52. X = build_feature_frame(df)
  53. result = train_tabular_model("isolation_forest", X)
  54. scores = pd.Series(result["anomaly_scores"])
  55. print(f"IsolationForest 完成: 样本数={len(scores)}, 最高异常分={scores.max():.4f}, 平均异常分={scores.mean():.4f}")
  56. return
  57. if args.save_model:
  58. bundle = create_model_bundle(
  59. df,
  60. model_name=args.model,
  61. target_defect_type=args.target_defect_type,
  62. target_severity=args.target_severity,
  63. )
  64. save_model_bundle(bundle, args.save_model)
  65. result = {"metrics": bundle["metrics"]}
  66. print(f"模型包已保存: {args.save_model}")
  67. else:
  68. X, y = build_supervised_dataset(
  69. df,
  70. target_defect_type=args.target_defect_type,
  71. target_severity=args.target_severity,
  72. )
  73. result = train_tabular_model(args.model, X, y)
  74. print(f"{args.model} 训练完成: {result['metrics']}")
  75. predictions = predict_key_factors(
  76. df,
  77. target_defect_type=args.target_defect_type,
  78. target_severity=args.target_severity,
  79. model_name=args.model,
  80. top_n=args.top_n,
  81. )
  82. if predictions.empty:
  83. print("未找到关键因子候选。")
  84. else:
  85. columns = ["维度", "因子值", "目标数", "异常倍数", "关键因子得分", "ml_probability", "model_name"]
  86. print(predictions[columns].to_string(index=False))
  87. if __name__ == "__main__":
  88. main()