features.py 1.2 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546
  1. """结构化 ML 特征工程。"""
  2. import pandas as pd
  3. from defect_analysis.schemas import normalize_defect_schema
  4. CATEGORICAL_FEATURES = [
  5. "equipment_id",
  6. "seat_id",
  7. "lam_equipment_id",
  8. "lam_seat_id",
  9. "lam_fixture_id",
  10. "lam_nozzle_id",
  11. "material_lot_oca",
  12. "material_lot_glass",
  13. "clean_equipment_id",
  14. "bond_equipment_id",
  15. "shift",
  16. "defect_geometry_type",
  17. ]
  18. NUMERIC_FEATURES = [
  19. "x_mm",
  20. "y_mm",
  21. "hour",
  22. "width_mm",
  23. "height_mm",
  24. "length_mm",
  25. "angle_deg",
  26. "area_mm2",
  27. ]
  28. def build_feature_frame(df):
  29. """把标准缺陷数据转换为可训练的数值特征矩阵。"""
  30. normalized = normalize_defect_schema(df)
  31. numeric = normalized[[col for col in NUMERIC_FEATURES if col in normalized.columns]].copy()
  32. for column in numeric.columns:
  33. numeric[column] = pd.to_numeric(numeric[column], errors="coerce").fillna(0.0)
  34. categorical = normalized[[col for col in CATEGORICAL_FEATURES if col in normalized.columns]].fillna("")
  35. encoded = pd.get_dummies(categorical.astype(str), prefix_sep="=", dtype=float)
  36. features = pd.concat([numeric.reset_index(drop=True), encoded.reset_index(drop=True)], axis=1)
  37. return features.fillna(0.0)