| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106 |
- """表格模型训练入口。"""
- from sklearn.ensemble import IsolationForest, RandomForestClassifier
- from sklearn.linear_model import LogisticRegression
- from sklearn.metrics import accuracy_score, roc_auc_score
- from sklearn.pipeline import make_pipeline
- from sklearn.preprocessing import StandardScaler
def classification_metrics(model, X, y, *, prefix="train"):
    """Compute evaluation metrics for a fitted binary classifier.

    Accuracy is always reported.  ROC AUC is added only when the model
    exposes ``predict_proba`` and ``y`` contains more than one distinct
    label; the AUC score is computed from column 1 of the
    ``predict_proba`` output.

    Args:
        model: Fitted estimator providing ``predict`` (and optionally
            ``predict_proba``).
        X: Feature matrix passed straight through to the model.
        y: True labels; must be iterable so distinct values can be counted.
        prefix: Prepended to every metric key, e.g. ``"train"`` yields
            ``"train_accuracy"``.

    Returns:
        A dict mapping ``"<prefix>_accuracy"`` (and, when available,
        ``"<prefix>_auc"``) to plain Python floats.
    """
    predicted_labels = model.predict(X)
    results = {f"{prefix}_accuracy": float(accuracy_score(y, predicted_labels))}

    # AUC needs probability scores and at least two distinct labels in y.
    auc_available = hasattr(model, "predict_proba") and len(set(y)) > 1
    if not auc_available:
        return results

    second_column_scores = model.predict_proba(X)[:, 1]
    results[f"{prefix}_auc"] = float(roc_auc_score(y, second_column_scores))
    return results
def extract_feature_importance(model, feature_columns, *, top_n=20):
    """Rank features by their contribution to a fitted model.

    Intended for engineering retrospectives and model audits.  The
    importance source is chosen in this order:

    1. ``model.feature_importances_`` when the attribute exists.
    2. The absolute value of the first row of ``coef_`` on the
       ``"logisticregression"`` step, when the model has ``named_steps``
       containing that key.
    3. The absolute value of the first row of ``model.coef_``.

    Args:
        model: Fitted estimator or pipeline.
        feature_columns: Feature names, paired positionally with the
            importance values (pairing stops at the shorter of the two).
        top_n: Maximum number of rows to return.

    Returns:
        A list of ``{"feature": name, "importance": float}`` dicts sorted
        by descending importance and cut to ``top_n`` entries; an empty
        list when the model exposes none of the sources above.
    """
    if hasattr(model, "feature_importances_"):
        weights = model.feature_importances_
    else:
        # Both linear cases read coef_[0]; only the object holding it differs.
        if hasattr(model, "named_steps") and "logisticregression" in model.named_steps:
            coef_holder = model.named_steps["logisticregression"]
        elif hasattr(model, "coef_"):
            coef_holder = model
        else:
            return []
        weights = abs(coef_holder.coef_[0])

    ranked = sorted(
        (
            {"feature": name, "importance": float(weight)}
            for name, weight in zip(feature_columns, weights)
        ),
        key=lambda row: row["importance"],
        reverse=True,
    )
    return ranked[:top_n]
def _require_labels(model_name, y):
    """Raise ValueError when a supervised model is requested without labels."""
    if y is None:
        raise ValueError(f"{model_name} 需要监督标签 y")


def _fit_supervised(model_name, model, X, y):
    """Fit a classifier on (X, y) and package it with its training metrics."""
    model.fit(X, y)
    return {"model_name": model_name, "model": model, "metrics": classification_metrics(model, X, y)}


def train_tabular_model(model_name, X, y=None, *, random_state=42):
    """Train a tabular model selected by name.

    Supported ``model_name`` values:

    * ``"random_forest"``, ``"logistic_regression"``, ``"xgboost"``,
      ``"lightgbm"`` -- supervised classifiers; ``y`` is required.
    * ``"isolation_forest"`` -- unsupervised anomaly detector; ``y`` is
      ignored.

    ``xgboost`` and ``lightgbm`` are imported lazily, so those packages
    are only needed when the corresponding model is requested.

    Args:
        model_name: One of the names listed above.
        X: Training feature matrix.
        y: Training labels; required for the supervised models.
        random_state: Seed forwarded to every estimator that accepts one.

    Returns:
        For supervised models, a dict with keys ``"model_name"``,
        ``"model"`` (the fitted estimator) and ``"metrics"`` (the result of
        ``classification_metrics`` evaluated on the same ``X``/``y`` used
        for fitting, i.e. in-sample metrics).  For ``"isolation_forest"``,
        a dict with keys ``"model_name"``, ``"model"`` and
        ``"anomaly_scores"`` (the negated ``decision_function`` output for
        ``X``).

    Raises:
        ValueError: If ``model_name`` is not supported, or a supervised
            model is requested with ``y=None``.
        RuntimeError: If ``xgboost`` / ``lightgbm`` is requested but the
            package cannot be imported.
    """
    if model_name == "random_forest":
        _require_labels(model_name, y)
        model = RandomForestClassifier(
            n_estimators=100,
            max_depth=8,
            min_samples_leaf=2,
            random_state=random_state,
            class_weight="balanced",
        )
        return _fit_supervised(model_name, model, X, y)

    if model_name == "logistic_regression":
        _require_labels(model_name, y)
        model = make_pipeline(
            StandardScaler(with_mean=False),
            LogisticRegression(max_iter=3000, class_weight="balanced", solver="liblinear"),
        )
        return _fit_supervised(model_name, model, X, y)

    if model_name == "isolation_forest":
        model = IsolationForest(n_estimators=100, contamination="auto", random_state=random_state)
        model.fit(X)
        # Negate the decision function so the sign of the reported score is flipped.
        scores = -model.decision_function(X)
        return {"model_name": model_name, "model": model, "anomaly_scores": scores}

    if model_name == "xgboost":
        # Labels are validated before the optional import, so a missing y is
        # reported even when xgboost is not installed.
        _require_labels(model_name, y)
        try:
            from xgboost import XGBClassifier
        except ImportError as exc:
            raise RuntimeError("XGBoost 未安装,请安装 xgboost 后再启用该模型") from exc
        model = XGBClassifier(
            n_estimators=100,
            max_depth=4,
            learning_rate=0.08,
            eval_metric="logloss",
            random_state=random_state,
        )
        return _fit_supervised(model_name, model, X, y)

    if model_name == "lightgbm":
        _require_labels(model_name, y)
        try:
            from lightgbm import LGBMClassifier
        except ImportError as exc:
            raise RuntimeError("LightGBM 未安装,请安装 lightgbm 后再启用该模型") from exc
        model = LGBMClassifier(
            n_estimators=100,
            max_depth=4,
            learning_rate=0.08,
            random_state=random_state,
            verbose=-1,
        )
        return _fit_supervised(model_name, model, X, y)

    raise ValueError(f"不支持的模型: {model_name}")
|