"""Data quality assessment."""
- from defect_analysis.schemas import CORE_REQUIRED_COLUMNS
# Registered defect-type labels. The quality report counts any `defect_type`
# value outside this set as an unregistered enum.
VALID_DEFECT_TYPES = {
    "划痕",
    "亮点",
    "暗点",
    "气泡",
    "色差",
    "漏光",
    "裂纹",
    "异物",
}

# Columns checked together when computing the traceability coverage rate.
TRACEABILITY_COLUMNS = [
    "equipment_id", "seat_id",
    "lam_equipment_id", "lam_seat_id", "lam_fixture_id",
    "inspection_station",
]
def _non_empty_rate(df, columns):
    """Return the share of rows whose listed columns are all filled in.

    Only the columns that actually exist in ``df`` are checked; names in
    ``columns`` that are absent from the frame are ignored here.

    A cell counts as filled when it is not NA and its string form, after
    stripping surrounding whitespace, is not empty.

    Args:
        df: The pandas DataFrame to inspect.
        columns: Iterable of column names to check.

    Returns:
        A float in ``[0.0, 1.0]``. ``0.0`` is returned when ``df`` has no
        rows or none of the requested columns are present.
    """
    present = [column for column in columns if column in df.columns]
    if not present or df.empty:
        return 0.0

    subset = df[present]
    # Strip before comparing so whitespace-only cells (e.g. " ") are treated
    # as empty instead of silently inflating the completeness rate.
    as_text = subset.astype(str).apply(lambda series: series.str.strip())
    filled = subset.notna() & (as_text != "")
    return float(filled.all(axis=1).mean())
def build_data_quality_report(df):
    """Build the pre-import data quality report for a defect table.

    The report combines five metrics into a 0-100 score:

    * required-field completeness (weight 30)
    * coordinates lying inside the panel (weight 25)
    * defect types belonging to ``VALID_DEFECT_TYPES`` (weight 20)
    * traceability-field completeness (weight 20)
    * ``defect_id`` uniqueness (weight 5)

    A metric whose input columns are missing is scored as its worst case
    rather than raising, so a malformed file still yields a report.

    Args:
        df: The pandas DataFrame holding the defect records.

    Returns:
        A dict with the keys ``score`` (rounded to one decimal),
        ``required_complete_rate``, ``coordinate_valid_rate``,
        ``enum_valid_rate``, ``traceability_rate``,
        ``duplicate_defect_rate`` and ``issues`` (a non-empty list of
        human-readable messages).
    """
    # Imported locally so the function does not depend on a module-level
    # pandas import being present in this file.
    import pandas as pd

    if df.empty:
        return {
            "score": 0.0,
            "required_complete_rate": 0.0,
            "coordinate_valid_rate": 0.0,
            "enum_valid_rate": 0.0,
            "traceability_rate": 0.0,
            "duplicate_defect_rate": 0.0,
            "issues": ["数据为空"],
        }

    missing_columns = [column for column in CORE_REQUIRED_COLUMNS if column not in df.columns]
    required_complete_rate = _non_empty_rate(df, CORE_REQUIRED_COLUMNS)

    coordinate_columns = ["x_mm", "y_mm", "panel_width_mm", "panel_height_mm"]
    if all(column in df.columns for column in coordinate_columns):
        # Coerce to numbers first: text or mixed-type columns would otherwise
        # raise TypeError on comparison. Unparseable values become NaN, which
        # fails every comparison below and so counts as invalid.
        coords = df[coordinate_columns].apply(pd.to_numeric, errors="coerce")
        coordinate_valid = (
            (coords["x_mm"] >= 0)
            & (coords["x_mm"] <= coords["panel_width_mm"])
            & (coords["y_mm"] >= 0)
            & (coords["y_mm"] <= coords["panel_height_mm"])
        )
        coordinate_valid_rate = float(coordinate_valid.mean())
    else:
        coordinate_valid_rate = 0.0

    if "defect_type" in df.columns:
        enum_valid_rate = float(df["defect_type"].isin(VALID_DEFECT_TYPES).mean())
    else:
        enum_valid_rate = 0.0

    traceability_rate = _non_empty_rate(df, TRACEABILITY_COLUMNS)

    has_defect_id = "defect_id" in df.columns
    if has_defect_id:
        duplicate_defect_rate = float(df["defect_id"].duplicated().mean())
    else:
        # Uniqueness cannot be verified without the column; use the worst
        # case, matching how the other metrics treat missing columns.
        duplicate_defect_rate = 1.0

    score = (
        required_complete_rate * 30
        + coordinate_valid_rate * 25
        + enum_valid_rate * 20
        + traceability_rate * 20
        + (1 - duplicate_defect_rate) * 5
    )

    issues = []
    if required_complete_rate < 1:
        issues.append("必填字段存在空值")
    if missing_columns:
        issues.append("缺少必填字段: " + ", ".join(missing_columns))
    if coordinate_valid_rate < 1:
        issues.append("坐标存在超出面板范围的数据")
    if enum_valid_rate < 1:
        issues.append("缺陷类型存在未登记枚举")
    if traceability_rate < 1:
        issues.append("工序追溯字段覆盖不完整")
    if not has_defect_id:
        issues.append("缺少 defect_id 字段，无法校验重复")
    elif duplicate_defect_rate > 0:
        issues.append("defect_id 存在重复")

    return {
        "score": round(float(score), 1),
        "required_complete_rate": required_complete_rate,
        "coordinate_valid_rate": coordinate_valid_rate,
        "enum_valid_rate": enum_valid_rate,
        "traceability_rate": traceability_rate,
        "duplicate_defect_rate": duplicate_defect_rate,
        "issues": issues or ["数据质量良好"],
    }
|