predict.py 1.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748
  1. """统一 ML 推理入口。"""
  2. import pandas as pd
  3. from defect_analysis.ml.datasets import build_supervised_dataset
  4. from defect_analysis.ml.key_factors import find_key_factors
  5. from defect_analysis.ml.tabular_models import train_tabular_model
  6. def predict_key_factors(df, *, target_defect_type=None, target_severity=None, model_name="random_forest", top_n=20):
  7. """用统计关键因子 + 监督模型概率输出候选关键因子。"""
  8. key_factors = find_key_factors(
  9. df,
  10. target_defect_type=target_defect_type,
  11. target_severity=target_severity,
  12. top_n=top_n,
  13. )
  14. if key_factors.empty:
  15. return key_factors
  16. X, y = build_supervised_dataset(
  17. df,
  18. target_defect_type=target_defect_type,
  19. target_severity=target_severity,
  20. )
  21. if y.nunique() < 2:
  22. key_factors["ml_probability"] = 0.0
  23. key_factors["model_name"] = model_name
  24. return key_factors
  25. trained = train_tabular_model(model_name, X, y)
  26. model = trained["model"]
  27. probabilities = pd.Series(model.predict_proba(X)[:, 1], index=X.index)
  28. scored = key_factors.copy()
  29. # 向量化:把 key_factors 的维度/因子值映射为 one-hot 列名后取概率均值
  30. dimension = scored["维度"].astype(str)
  31. value = scored["因子值"].astype(str)
  32. column_names = dimension + "=" + value
  33. ml_scores = []
  34. for col in column_names:
  35. if col in X.columns:
  36. ml_scores.append(float(probabilities.loc[X[col] == 1].mean()) if X[col].any() else 0.0)
  37. else:
  38. ml_scores.append(0.0)
  39. scored["ml_probability"] = ml_scores
  40. scored["model_name"] = model_name
  41. return scored.sort_values(["ml_probability", "关键因子得分"], ascending=False).reset_index(drop=True)