"""Data quality assessment."""

from defect_analysis.schemas import CORE_REQUIRED_COLUMNS

# Defect type labels accepted by the enum-validity check.
VALID_DEFECT_TYPES = {"划痕", "亮点", "暗点", "气泡", "色差", "漏光", "裂纹", "异物"}

# Columns that are checked together to compute the traceability rate.
TRACEABILITY_COLUMNS = [
    "equipment_id",
    "seat_id",
    "lam_equipment_id",
    "lam_seat_id",
    "lam_fixture_id",
    "inspection_station",
]


def _non_empty_rate(df, columns):
    """Return the fraction of rows in which every listed column is filled in.

    A cell counts as filled when it is not null and its string form is not
    empty after stripping surrounding whitespace, so blank-looking values
    such as ``" "`` are treated as missing.

    Args:
        df: DataFrame to inspect.
        columns: Column names to check. Names absent from ``df`` are ignored.

    Returns:
        A float in ``[0.0, 1.0]``. ``0.0`` is returned when ``df`` has no
        rows or when none of ``columns`` exist in ``df``.
    """
    present = [name for name in columns if name in df.columns]
    if not present or df.empty:
        return 0.0

    subset = df[present]
    # Strip before comparing so whitespace-only strings do not pass as data.
    text = subset.astype(str).apply(lambda col: col.str.strip())
    filled = subset.notna() & (text != "")
    return float(filled.all(axis=1).mean())


def build_data_quality_report(df):
    """Build the data quality report used before a production-grade import.

    The score is a weighted sum of five components (weights total 100):
    required-field completeness (30), coordinate validity (25), defect type
    validity (20), traceability coverage (20) and defect_id uniqueness (5).

    Args:
        df: DataFrame of defect records. Non-empty input must contain the
            columns ``x_mm``, ``y_mm``, ``panel_width_mm``,
            ``panel_height_mm``, ``defect_type`` and ``defect_id``.

    Returns:
        A dict with the keys ``score`` (rounded to one decimal), the five
        component rates, and ``issues`` (a list of messages). An empty
        ``df`` yields all-zero rates and a single "data is empty" issue.

    Raises:
        KeyError: If a non-empty ``df`` lacks one of the columns listed above.
    """
    if df.empty:
        return {
            "score": 0.0,
            "required_complete_rate": 0.0,
            "coordinate_valid_rate": 0.0,
            "enum_valid_rate": 0.0,
            "traceability_rate": 0.0,
            "duplicate_defect_rate": 0.0,
            "issues": ["数据为空"],
        }

    required_complete_rate = _non_empty_rate(df, CORE_REQUIRED_COLUMNS)

    # A coordinate is valid when it lies inside the panel rectangle,
    # borders included.
    coordinate_valid = (
        (df["x_mm"] >= 0)
        & (df["x_mm"] <= df["panel_width_mm"])
        & (df["y_mm"] >= 0)
        & (df["y_mm"] <= df["panel_height_mm"])
    )
    coordinate_valid_rate = float(coordinate_valid.mean())

    enum_valid_rate = float(df["defect_type"].isin(VALID_DEFECT_TYPES).mean())
    traceability_rate = _non_empty_rate(df, TRACEABILITY_COLUMNS)

    # NOTE(review): duplicated() also flags repeated null ids as duplicates;
    # confirm whether missing ids should be reported here as well.
    duplicate_defect_rate = float(df["defect_id"].duplicated().mean())

    score = (
        required_complete_rate * 30
        + coordinate_valid_rate * 25
        + enum_valid_rate * 20
        + traceability_rate * 20
        + (1 - duplicate_defect_rate) * 5
    )

    # Each failed check contributes its message, in this fixed order.
    checks = [
        (required_complete_rate < 1, "必填字段存在空值"),
        (coordinate_valid_rate < 1, "坐标存在超出面板范围的数据"),
        (enum_valid_rate < 1, "缺陷类型存在未登记枚举"),
        (traceability_rate < 1, "工序追溯字段覆盖不完整"),
        (duplicate_defect_rate > 0, "defect_id 存在重复"),
    ]
    issues = [message for failed, message in checks if failed]

    return {
        "score": round(float(score), 1),
        "required_complete_rate": required_complete_rate,
        "coordinate_valid_rate": coordinate_valid_rate,
        "enum_valid_rate": enum_valid_rate,
        "traceability_rate": traceability_rate,
        "duplicate_defect_rate": duplicate_defect_rate,
        "issues": issues or ["数据质量良好"],
    }