import os
import tempfile
import unittest

import pandas as pd

from defect_analysis.ml.datasets import build_supervised_dataset
from defect_analysis.ml.features import build_feature_frame
from defect_analysis.ml.image_models import ImageModelUnavailable, ImageModelWrapper
from defect_analysis.ml.model_bundle import (
    create_model_bundle,
    load_model_bundle,
    predict_with_bundle,
    save_model_bundle,
)
from defect_analysis.ml.model_registry import detect_optional_model_backends
from defect_analysis.ml.predict import predict_key_factors
from defect_analysis.ml.tabular_models import train_tabular_model
from defect_analysis.schemas import normalize_defect_schema
from train_ml_models import build_bundle_report


class MLPlatformTest(unittest.TestCase):
    """Smoke tests for the ML helpers, run against a small synthetic defect table.

    The fixture built in ``setUp`` has 40 rows. The first 24 ("hot") rows share
    one defect type, equipment, seat, fixture and material lot; the remaining
    16 rows share a second set of values.
    """

    def setUp(self):
        """Build the 40-row synthetic frame and pass it through schema normalization."""
        rows = []
        for i in range(40):
            # Rows 0..23 form the "hot" group used as the positive class below.
            hot = i < 24
            rows.append(
                {
                    "defect_id": f"D{i}",
                    "panel_id": f"P{i}",
                    "batch_id": "B1",
                    "equipment_id": "LAM-A01" if hot else "LAM-B01",
                    "seat_id": "R1C1" if hot else "R2C2",
                    "inspection_station": "AOI-1",
                    "timestamp": pd.Timestamp("2026-04-01 08:00:00"),
                    "defect_type": "气泡" if hot else "划痕",
                    "severity": "严重" if i % 5 == 0 else "轻微",
                    "x_mm": 10.0 + i,
                    "y_mm": 20.0,
                    "panel_width_mm": 155.0,
                    "panel_height_mm": 340.0,
                    "hour": 8,
                    "shift": "白班",
                    "day": "2026-04-01",
                    "lam_fixture_id": "FIX-HOT" if hot else "FIX-OK",
                    "material_lot_oca": "OCA-HOT" if hot else "OCA-OK",
                }
            )
        self.df = normalize_defect_schema(pd.DataFrame(rows))

    def test_build_feature_frame_creates_numeric_matrix(self):
        """The feature frame keeps the row count and holds only numeric dtypes."""
        features = build_feature_frame(self.df)

        self.assertEqual(len(self.df), len(features))
        # "biufc" are the NumPy dtype kinds for bool, int, uint, float, complex.
        self.assertTrue(all(dtype.kind in "biufc" for dtype in features.dtypes))
        self.assertTrue(any(col.startswith("equipment_id=") for col in features.columns))

    def test_build_supervised_dataset_targets_defect_type(self):
        """The label vector sums to 24, the number of rows with the target type."""
        X, y = build_supervised_dataset(self.df, target_defect_type="气泡")

        self.assertEqual(len(self.df), len(X))
        self.assertEqual(24, int(y.sum()))

    def test_train_random_forest_and_logistic_regression(self):
        """Both supervised trainers return a model; the forest also reports metrics."""
        X, y = build_supervised_dataset(self.df, target_defect_type="气泡")

        rf = train_tabular_model("random_forest", X, y)
        lr = train_tabular_model("logistic_regression", X, y)

        self.assertIn("model", rf)
        self.assertIn("metrics", rf)
        self.assertIn("model", lr)
        self.assertGreaterEqual(rf["metrics"]["train_accuracy"], 0.5)

    def test_train_isolation_forest_outputs_anomaly_scores(self):
        """The unsupervised trainer yields one anomaly score per input row."""
        X = build_feature_frame(self.df)

        result = train_tabular_model("isolation_forest", X)

        self.assertIn("anomaly_scores", result)
        self.assertEqual(len(self.df), len(result["anomaly_scores"]))

    def test_predict_key_factors_returns_model_scores(self):
        """Key-factor prediction returns a non-empty frame with score columns."""
        predictions = predict_key_factors(self.df, target_defect_type="气泡")

        self.assertFalse(predictions.empty)
        self.assertIn("ml_probability", predictions.columns)
        self.assertIn("model_name", predictions.columns)

    def test_optional_backends_are_reported_without_import_failure(self):
        """Backend detection lists xgboost and lightgbm without raising."""
        backends = detect_optional_model_backends()

        self.assertIn("xgboost", backends)
        self.assertIn("lightgbm", backends)

    def test_image_model_wrapper_is_explicitly_unavailable_without_backend(self):
        """A default-constructed wrapper raises ImageModelUnavailable on predict."""
        wrapper = ImageModelWrapper()

        with self.assertRaises(ImageModelUnavailable):
            wrapper.predict([])

    def test_model_bundle_can_be_saved_loaded_and_score_new_data(self):
        """A bundle exposes its audit fields and survives a save/load round trip."""
        bundle = create_model_bundle(
            self.df,
            model_name="random_forest",
            target_defect_type="气泡",
        )

        self.assertEqual("random_forest", bundle["model_name"])
        self.assertEqual("气泡", bundle["target"]["defect_type"])
        self.assertGreater(len(bundle["feature_columns"]), 0)
        self.assertIn("metrics", bundle)
        self.assertIn("validation_metrics", bundle)
        self.assertIn("feature_importance", bundle)
        self.assertGreater(len(bundle["feature_importance"]), 0)
        self.assertIn("feature", bundle["feature_importance"][0])
        self.assertIn("importance", bundle["feature_importance"][0])

        # Write into a private temporary directory rather than the current
        # working directory: nothing is left behind if a step raises, and
        # concurrent test runs cannot clobber each other's artifact.
        with tempfile.TemporaryDirectory() as tmp_dir:
            path = os.path.join(tmp_dir, "tmp_test_model_bundle.joblib")
            save_model_bundle(bundle, path)
            loaded = load_model_bundle(path)
            scored = predict_with_bundle(loaded, self.df.tail(5))

        self.assertEqual(5, len(scored))
        self.assertIn("ml_probability", scored.columns)
        self.assertTrue(scored["ml_probability"].between(0, 1).all())

    def test_model_bundle_aligns_missing_feature_columns_for_new_data(self):
        """Scoring succeeds when new rows carry category values unseen in training."""
        bundle = create_model_bundle(
            self.df,
            model_name="logistic_regression",
            target_defect_type="气泡",
        )
        new_df = self.df.tail(3).copy()
        # Neither value occurs in the setUp fixture.
        new_df["equipment_id"] = "NEW-LAM"
        new_df["seat_id"] = "NEW-SEAT"

        scored = predict_with_bundle(bundle, new_df)

        self.assertEqual(3, len(scored))
        self.assertIn("ml_prediction", scored.columns)

    def test_bundle_report_excludes_model_object_and_keeps_audit_fields(self):
        """The report omits the "model" key but keeps metrics and importances."""
        bundle = create_model_bundle(
            self.df,
            model_name="random_forest",
            target_defect_type="气泡",
        )

        report = build_bundle_report(bundle)

        self.assertNotIn("model", report)
        self.assertIn("validation_metrics", report)
        self.assertIn("feature_importance", report)
        self.assertGreater(report["feature_count"], 0)


if __name__ == "__main__":
    unittest.main()