|
|
@@ -7,14 +7,34 @@ from sklearn.pipeline import make_pipeline
|
|
|
from sklearn.preprocessing import StandardScaler
|
|
|
|
|
|
|
|
|
def classification_metrics(model, X, y, *, prefix="train"):
    """Compute evaluation metrics for a binary classifier.

    Args:
        model: Fitted estimator exposing ``predict``. When it also exposes
            ``predict_proba``, that is used to compute the AUC.
        X: Feature matrix, passed unchanged to the model.
        y: True labels aligned with ``X``; iterated once to count the
            distinct classes (assumes hashable label values).
        prefix: Keyword-only prefix for the metric names, e.g. ``"train"``
            yields ``"train_accuracy"`` and ``"train_auc"``.

    Returns:
        A dict holding ``"{prefix}_accuracy"`` and, when it can be computed,
        ``"{prefix}_auc"``; both values are plain Python floats.
    """
    pred = model.predict(X)
    metrics = {f"{prefix}_accuracy": float(accuracy_score(y, pred))}

    # The AUC is scored from column 1 of predict_proba, which is only
    # meaningful with exactly two classes: with a single class the AUC is
    # undefined, and with three or more roc_auc_score rejects 1-D scores.
    # In both cases the AUC entry is omitted instead of raising.
    if hasattr(model, "predict_proba") and len(set(y)) == 2:
        positive_scores = model.predict_proba(X)[:, 1]
        metrics[f"{prefix}_auc"] = float(roc_auc_score(y, positive_scores))
    return metrics
|
|
|
|
|
|
|
|
|
def extract_feature_importance(model, feature_columns, *, top_n=20):
    """Rank a model's features by contribution, for engineering review and audits.

    The importance values are read from the first source that exists:

    1. ``model.feature_importances_`` (used as-is);
    2. the ``"logisticregression"`` step of a pipeline-like object exposing
       ``named_steps`` (absolute value of the first row of ``coef_``);
    3. ``model.coef_`` (absolute value of the first row).

    Args:
        model: Fitted estimator or pipeline.
        feature_columns: Feature names, in the same order as the model's
            features.
        top_n: Keyword-only maximum number of rows to return. ``None``
            returns every feature.

    Returns:
        A list of ``{"feature": name, "importance": float}`` dicts sorted by
        descending importance, or an empty list when the model exposes none
        of the supported attributes.

    Raises:
        ValueError: If ``top_n`` is negative, or if the number of feature
            names differs from the number of importance values.
    """
    # A negative slice bound would silently drop the *lowest* rows instead of
    # limiting the result, so reject it up front.
    if top_n is not None and top_n < 0:
        raise ValueError(f"top_n 不能为负数: {top_n}")

    # abs() is applied per element so plain sequences work as well as arrays.
    if hasattr(model, "feature_importances_"):
        values = [float(v) for v in model.feature_importances_]
    elif hasattr(model, "named_steps") and "logisticregression" in model.named_steps:
        coefficients = model.named_steps["logisticregression"].coef_[0]
        values = [abs(float(v)) for v in coefficients]
    elif hasattr(model, "coef_"):
        values = [abs(float(v)) for v in model.coef_[0]]
    else:
        return []

    names = list(feature_columns)
    # zip() would truncate to the shorter input and pair names with the wrong
    # values; a mismatch is reported instead of producing a misleading report.
    if len(names) != len(values):
        raise ValueError(
            f"feature_columns 数量({len(names)})与模型特征数量({len(values)})不一致"
        )

    rows = [
        {"feature": name, "importance": value}
        for name, value in zip(names, values)
    ]
    rows.sort(key=lambda item: item["importance"], reverse=True)
    return rows if top_n is None else rows[:top_n]
|
|
|
+
|
|
|
+
|
|
|
def train_tabular_model(model_name, X, y=None, *, random_state=42):
|
|
|
"""训练表格模型。
|
|
|
|
|
|
@@ -31,7 +51,7 @@ def train_tabular_model(model_name, X, y=None, *, random_state=42):
|
|
|
class_weight="balanced",
|
|
|
)
|
|
|
model.fit(X, y)
|
|
|
- return {"model_name": model_name, "model": model, "metrics": _classification_metrics(model, X, y)}
|
|
|
+ return {"model_name": model_name, "model": model, "metrics": classification_metrics(model, X, y)}
|
|
|
|
|
|
if model_name == "logistic_regression":
|
|
|
if y is None:
|
|
|
@@ -41,7 +61,7 @@ def train_tabular_model(model_name, X, y=None, *, random_state=42):
|
|
|
LogisticRegression(max_iter=3000, class_weight="balanced", solver="liblinear"),
|
|
|
)
|
|
|
model.fit(X, y)
|
|
|
- return {"model_name": model_name, "model": model, "metrics": _classification_metrics(model, X, y)}
|
|
|
+ return {"model_name": model_name, "model": model, "metrics": classification_metrics(model, X, y)}
|
|
|
|
|
|
if model_name == "isolation_forest":
|
|
|
model = IsolationForest(n_estimators=100, contamination="auto", random_state=random_state)
|
|
|
@@ -64,7 +84,7 @@ def train_tabular_model(model_name, X, y=None, *, random_state=42):
|
|
|
random_state=random_state,
|
|
|
)
|
|
|
model.fit(X, y)
|
|
|
- return {"model_name": model_name, "model": model, "metrics": _classification_metrics(model, X, y)}
|
|
|
+ return {"model_name": model_name, "model": model, "metrics": classification_metrics(model, X, y)}
|
|
|
|
|
|
if model_name == "lightgbm":
|
|
|
if y is None:
|
|
|
@@ -81,6 +101,6 @@ def train_tabular_model(model_name, X, y=None, *, random_state=42):
|
|
|
verbose=-1,
|
|
|
)
|
|
|
model.fit(X, y)
|
|
|
- return {"model_name": model_name, "model": model, "metrics": _classification_metrics(model, X, y)}
|
|
|
+ return {"model_name": model_name, "model": model, "metrics": classification_metrics(model, X, y)}
|
|
|
|
|
|
raise ValueError(f"不支持的模型: {model_name}")
|