2 tháng trước cách đây · 385dd625d7
--- a/defect_analysis/ml/key_factors.py
+++ b/defect_analysis/ml/key_factors.py
@@ -0,0 +1,113 @@
 
				+"""不良关键因子发现。"""
			
 
				+
			
 
				+import numpy as np
			
 
				+import pandas as pd
			
 
				+
			
 
				+from defect_analysis.schemas import normalize_defect_schema
			
 
				+
			
 
				+
			
 
				+DEFAULT_FACTOR_DIMENSIONS = [
			
 
				+    "equipment_id",
			
 
				+    "seat_id",
			
 
				+    "lam_equipment_id",
			
 
				+    "lam_seat_id",
			
 
				+    "lam_fixture_id",
			
 
				+    "lam_jig_id",
			
 
				+    "lam_nozzle_id",
			
 
				+    "material_lot_oca",
			
 
				+    "material_lot_glass",
			
 
				+    "material_lot_polarizer",
			
 
				+    "clean_equipment_id",
			
 
				+    "clean_slot_id",
			
 
				+    "bond_equipment_id",
			
 
				+    "bond_head_id",
			
 
				+    "recipe_id",
			
 
				+    "shift",
			
 
				+    "defect_geometry_type",
			
 
				+]
			
 
				+
			
 
				+
			
 
				+def _build_target_mask(df, target_defect_type=None, target_severity=None):
			
 
				+    if target_defect_type:
			
 
				+        return df["defect_type"] == target_defect_type
			
 
				+    if target_severity:
			
 
				+        return df["severity"] == target_severity
			
 
				+    return df["severity"] == "严重"
			
 
				+
			
 
				+
			
 
				+def find_key_factors(
			
 
				+    df,
			
 
				+    *,
			
 
				+    target_defect_type=None,
			
 
				+    target_severity=None,
			
 
				+    dimensions=None,
			
 
				+    min_count=3,
			
 
				+    min_lift=1.1,
			
 
				+    top_n=20,
			
 
				+):
			
 
				+    """查找与目标不良显著相关的关键因子。
			
 
				+
			
 
				+    当前实现是可解释统计排序：按每个维度取值计算目标占比、异常倍数和支持度，
			
 
				+    用综合得分排序。它适合生产早期作为 ML 特征与根因候选的基线。
			
 
				+    """
			
 
				+    normalized = normalize_defect_schema(df)
			
 
				+    if normalized.empty:
			
 
				+        return pd.DataFrame()
			
 
				+
			
 
				+    dimensions = DEFAULT_FACTOR_DIMENSIONS if dimensions is None else dimensions
			
 
				+    target_mask = _build_target_mask(normalized, target_defect_type, target_severity)
			
 
				+    baseline_rate = float(target_mask.mean())
			
 
				+    if baseline_rate <= 0:
			
 
				+        return pd.DataFrame(
			
 
				+            columns=["维度", "因子值", "样本数", "目标数", "目标占比", "基线占比", "异常倍数", "支持度", "关键因子得分"]
			
 
				+        )
			
 
				+
			
 
				+    rows = []
			
 
				+    total = len(normalized)
			
 
				+    for dimension in dimensions:
			
 
				+        if dimension not in normalized.columns:
			
 
				+            continue
			
 
				+        values = normalized[dimension].fillna("").astype(str)
			
 
				+        valid = normalized[values != ""].copy()
			
 
				+        if valid.empty:
			
 
				+            continue
			
 
				+        valid_target = target_mask.loc[valid.index]
			
 
				+        grouped = valid.assign(_target=valid_target.astype(int)).groupby(dimension)
			
 
				+        for value, group in grouped:
			
 
				+            count = len(group)
			
 
				+            if count < min_count:
			
 
				+                continue
			
 
				+            target_count = int(group["_target"].sum())
			
 
				+            if target_count == 0:
			
 
				+                continue
			
 
				+            target_rate = target_count / count
			
 
				+            lift = target_rate / baseline_rate
			
 
				+            if lift < min_lift:
			
 
				+                continue
			
 
				+            support = count / total
			
 
				+            score = (max(lift - 1, 0) * 45) + (target_rate * 30) + (np.sqrt(target_count) * 8) + (support * 17)
			
 
				+            rows.append(
			
 
				+                {
			
 
				+                    "维度": dimension,
			
 
				+                    "因子值": str(value),
			
 
				+                    "样本数": int(count),
			
 
				+                    "目标数": target_count,
			
 
				+                    "目标占比": round(float(target_rate), 4),
			
 
				+                    "基线占比": round(float(baseline_rate), 4),
			
 
				+                    "异常倍数": round(float(lift), 2),
			
 
				+                    "支持度": round(float(support), 4),
			
 
				+                    "关键因子得分": round(float(score), 2),
			
 
				+                }
			
 
				+            )
			
 
				+
			
 
				+    if not rows:
			
 
				+        return pd.DataFrame(
			
 
				+            columns=["维度", "因子值", "样本数", "目标数", "目标占比", "基线占比", "异常倍数", "支持度", "关键因子得分"]
			
 
				+        )
			
 
				+
			
 
				+    result = pd.DataFrame(rows)
			
 
				+    return (
			
 
				+        result.sort_values(["关键因子得分", "目标数", "异常倍数"], ascending=False)
			
 
				+        .head(top_n)
			
 
				+        .reset_index(drop=True)
			
 
				+    )