|
@@ -62,50 +62,58 @@ def find_key_factors(
|
|
|
columns=["维度", "因子值", "样本数", "目标数", "目标占比", "基线占比", "异常倍数", "支持度", "关键因子得分"]
|
|
columns=["维度", "因子值", "样本数", "目标数", "目标占比", "基线占比", "异常倍数", "支持度", "关键因子得分"]
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
- rows = []
|
|
|
|
|
total = len(normalized)
|
|
total = len(normalized)
|
|
|
|
|
+ all_rows = []
|
|
|
for dimension in dimensions:
|
|
for dimension in dimensions:
|
|
|
if dimension not in normalized.columns:
|
|
if dimension not in normalized.columns:
|
|
|
continue
|
|
continue
|
|
|
- values = normalized[dimension].fillna("").astype(str)
|
|
|
|
|
- valid = normalized[values != ""].copy()
|
|
|
|
|
- if valid.empty:
|
|
|
|
|
|
|
+ series = normalized[dimension].fillna("").astype(str)
|
|
|
|
|
+ valid_idx = series != ""
|
|
|
|
|
+ if not valid_idx.any():
|
|
|
continue
|
|
continue
|
|
|
- valid_target = target_mask.loc[valid.index]
|
|
|
|
|
- grouped = valid.assign(_target=valid_target.astype(int)).groupby(dimension)
|
|
|
|
|
- for value, group in grouped:
|
|
|
|
|
- count = len(group)
|
|
|
|
|
- if count < min_count:
|
|
|
|
|
- continue
|
|
|
|
|
- target_count = int(group["_target"].sum())
|
|
|
|
|
- if target_count == 0:
|
|
|
|
|
- continue
|
|
|
|
|
- target_rate = target_count / count
|
|
|
|
|
- lift = target_rate / baseline_rate
|
|
|
|
|
- if lift < min_lift:
|
|
|
|
|
- continue
|
|
|
|
|
- support = count / total
|
|
|
|
|
- score = (max(lift - 1, 0) * 45) + (target_rate * 30) + (np.sqrt(target_count) * 8) + (support * 17)
|
|
|
|
|
- rows.append(
|
|
|
|
|
- {
|
|
|
|
|
- "维度": dimension,
|
|
|
|
|
- "因子值": str(value),
|
|
|
|
|
- "样本数": int(count),
|
|
|
|
|
- "目标数": target_count,
|
|
|
|
|
- "目标占比": round(float(target_rate), 4),
|
|
|
|
|
- "基线占比": round(float(baseline_rate), 4),
|
|
|
|
|
- "异常倍数": round(float(lift), 2),
|
|
|
|
|
- "支持度": round(float(support), 4),
|
|
|
|
|
- "关键因子得分": round(float(score), 2),
|
|
|
|
|
- }
|
|
|
|
|
- )
|
|
|
|
|
|
|
+ valid = normalized.loc[valid_idx].copy()
|
|
|
|
|
+ valid["_target"] = target_mask.loc[valid.index].astype(int)
|
|
|
|
|
+ valid["_value"] = series.loc[valid_idx]
|
|
|
|
|
+ grouped = valid.groupby("_value").agg(
|
|
|
|
|
+ count=("defect_id", "count"),
|
|
|
|
|
+ target_count=("_target", "sum"),
|
|
|
|
|
+ )
|
|
|
|
|
+ grouped = grouped[grouped["count"] >= min_count]
|
|
|
|
|
+ grouped = grouped[grouped["target_count"] > 0]
|
|
|
|
|
+ if grouped.empty:
|
|
|
|
|
+ continue
|
|
|
|
|
+ grouped["target_rate"] = grouped["target_count"] / grouped["count"]
|
|
|
|
|
+ grouped["lift"] = grouped["target_rate"] / baseline_rate
|
|
|
|
|
+ grouped = grouped[grouped["lift"] >= min_lift]
|
|
|
|
|
+ if grouped.empty:
|
|
|
|
|
+ continue
|
|
|
|
|
+ grouped["support"] = grouped["count"] / total
|
|
|
|
|
+ grouped["score"] = (
|
|
|
|
|
+ (grouped["lift"] - 1).clip(lower=0) * 45
|
|
|
|
|
+ + grouped["target_rate"] * 30
|
|
|
|
|
+ + np.sqrt(grouped["target_count"]) * 8
|
|
|
|
|
+ + grouped["support"] * 17
|
|
|
|
|
+ )
|
|
|
|
|
+ grouped = grouped.reset_index()
|
|
|
|
|
+ grouped = grouped.rename(columns={"_value": "因子值"})
|
|
|
|
|
+ grouped["维度"] = dimension
|
|
|
|
|
+ grouped["样本数"] = grouped["count"].astype(int)
|
|
|
|
|
+ grouped["目标数"] = grouped["target_count"].astype(int)
|
|
|
|
|
+ grouped["目标占比"] = grouped["target_rate"].round(4)
|
|
|
|
|
+ grouped["基线占比"] = round(baseline_rate, 4)
|
|
|
|
|
+ grouped["异常倍数"] = grouped["lift"].round(2)
|
|
|
|
|
+ grouped["支持度"] = grouped["support"].round(4)
|
|
|
|
|
+ grouped["关键因子得分"] = grouped["score"].round(2)
|
|
|
|
|
+ all_rows.append(
|
|
|
|
|
+ grouped[["维度", "因子值", "样本数", "目标数", "目标占比", "基线占比", "异常倍数", "支持度", "关键因子得分"]]
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
- if not rows:
|
|
|
|
|
|
|
+ if not all_rows:
|
|
|
return pd.DataFrame(
|
|
return pd.DataFrame(
|
|
|
columns=["维度", "因子值", "样本数", "目标数", "目标占比", "基线占比", "异常倍数", "支持度", "关键因子得分"]
|
|
columns=["维度", "因子值", "样本数", "目标数", "目标占比", "基线占比", "异常倍数", "支持度", "关键因子得分"]
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
- result = pd.DataFrame(rows)
|
|
|
|
|
|
|
+ result = pd.concat(all_rows, ignore_index=True)
|
|
|
return (
|
|
return (
|
|
|
result.sort_values(["关键因子得分", "目标数", "异常倍数"], ascending=False)
|
|
result.sort_values(["关键因子得分", "目标数", "异常倍数"], ascending=False)
|
|
|
.head(top_n)
|
|
.head(top_n)
|