"""结构化 ML 特征工程。""" import pandas as pd from defect_analysis.schemas import normalize_defect_schema CATEGORICAL_FEATURES = [ "equipment_id", "seat_id", "lam_equipment_id", "lam_seat_id", "lam_fixture_id", "lam_nozzle_id", "material_lot_oca", "material_lot_glass", "clean_equipment_id", "bond_equipment_id", "shift", "defect_geometry_type", ] NUMERIC_FEATURES = [ "x_mm", "y_mm", "hour", "width_mm", "height_mm", "length_mm", "angle_deg", "area_mm2", ] def build_feature_frame(df): """把标准缺陷数据转换为可训练的数值特征矩阵。""" normalized = normalize_defect_schema(df) numeric = normalized[[col for col in NUMERIC_FEATURES if col in normalized.columns]].copy() for column in numeric.columns: numeric[column] = pd.to_numeric(numeric[column], errors="coerce").fillna(0.0) categorical = normalized[[col for col in CATEGORICAL_FEATURES if col in normalized.columns]].fillna("") encoded = pd.get_dummies(categorical.astype(str), prefix_sep="=", dtype=float) features = pd.concat([numeric.reset_index(drop=True), encoded.reset_index(drop=True)], axis=1) return features.fillna(0.0)