key_factors.py 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121
  1. """不良关键因子发现。"""
  2. import numpy as np
  3. import pandas as pd
  4. from defect_analysis.schemas import normalize_defect_schema
  5. DEFAULT_FACTOR_DIMENSIONS = [
  6. "equipment_id",
  7. "seat_id",
  8. "lam_equipment_id",
  9. "lam_seat_id",
  10. "lam_fixture_id",
  11. "lam_jig_id",
  12. "lam_nozzle_id",
  13. "material_lot_oca",
  14. "material_lot_glass",
  15. "material_lot_polarizer",
  16. "clean_equipment_id",
  17. "clean_slot_id",
  18. "bond_equipment_id",
  19. "bond_head_id",
  20. "recipe_id",
  21. "shift",
  22. "defect_geometry_type",
  23. ]
  24. def _build_target_mask(df, target_defect_type=None, target_severity=None):
  25. if target_defect_type:
  26. return df["defect_type"] == target_defect_type
  27. if target_severity:
  28. return df["severity"] == target_severity
  29. return df["severity"] == "严重"
  30. def find_key_factors(
  31. df,
  32. *,
  33. target_defect_type=None,
  34. target_severity=None,
  35. dimensions=None,
  36. min_count=3,
  37. min_lift=1.1,
  38. top_n=20,
  39. ):
  40. """查找与目标不良显著相关的关键因子。
  41. 当前实现是可解释统计排序:按每个维度取值计算目标占比、异常倍数和支持度,
  42. 用综合得分排序。它适合生产早期作为 ML 特征与根因候选的基线。
  43. """
  44. normalized = normalize_defect_schema(df)
  45. if normalized.empty:
  46. return pd.DataFrame()
  47. dimensions = DEFAULT_FACTOR_DIMENSIONS if dimensions is None else dimensions
  48. target_mask = _build_target_mask(normalized, target_defect_type, target_severity)
  49. baseline_rate = float(target_mask.mean())
  50. if baseline_rate <= 0:
  51. return pd.DataFrame(
  52. columns=["维度", "因子值", "样本数", "目标数", "目标占比", "基线占比", "异常倍数", "支持度", "关键因子得分"]
  53. )
  54. total = len(normalized)
  55. all_rows = []
  56. for dimension in dimensions:
  57. if dimension not in normalized.columns:
  58. continue
  59. series = normalized[dimension].fillna("").astype(str)
  60. valid_idx = series != ""
  61. if not valid_idx.any():
  62. continue
  63. valid = normalized.loc[valid_idx].copy()
  64. valid["_target"] = target_mask.loc[valid.index].astype(int)
  65. valid["_value"] = series.loc[valid_idx]
  66. grouped = valid.groupby("_value").agg(
  67. count=("defect_id", "count"),
  68. target_count=("_target", "sum"),
  69. )
  70. grouped = grouped[grouped["count"] >= min_count]
  71. grouped = grouped[grouped["target_count"] > 0]
  72. if grouped.empty:
  73. continue
  74. grouped["target_rate"] = grouped["target_count"] / grouped["count"]
  75. grouped["lift"] = grouped["target_rate"] / baseline_rate
  76. grouped = grouped[grouped["lift"] >= min_lift]
  77. if grouped.empty:
  78. continue
  79. grouped["support"] = grouped["count"] / total
  80. grouped["score"] = (
  81. (grouped["lift"] - 1).clip(lower=0) * 45
  82. + grouped["target_rate"] * 30
  83. + np.sqrt(grouped["target_count"]) * 8
  84. + grouped["support"] * 17
  85. )
  86. grouped = grouped.reset_index()
  87. grouped = grouped.rename(columns={"_value": "因子值"})
  88. grouped["维度"] = dimension
  89. grouped["样本数"] = grouped["count"].astype(int)
  90. grouped["目标数"] = grouped["target_count"].astype(int)
  91. grouped["目标占比"] = grouped["target_rate"].round(4)
  92. grouped["基线占比"] = round(baseline_rate, 4)
  93. grouped["异常倍数"] = grouped["lift"].round(2)
  94. grouped["支持度"] = grouped["support"].round(4)
  95. grouped["关键因子得分"] = grouped["score"].round(2)
  96. all_rows.append(
  97. grouped[["维度", "因子值", "样本数", "目标数", "目标占比", "基线占比", "异常倍数", "支持度", "关键因子得分"]]
  98. )
  99. if not all_rows:
  100. return pd.DataFrame(
  101. columns=["维度", "因子值", "样本数", "目标数", "目标占比", "基线占比", "异常倍数", "支持度", "关键因子得分"]
  102. )
  103. result = pd.concat(all_rows, ignore_index=True)
  104. return (
  105. result.sort_values(["关键因子得分", "目标数", "异常倍数"], ascending=False)
  106. .head(top_n)
  107. .reset_index(drop=True)
  108. )