| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778 |
"""Data quality assessment."""
- from defect_analysis.schemas import CORE_REQUIRED_COLUMNS
# Defect type labels accepted by the enum-validity check.
VALID_DEFECT_TYPES = {
    "划痕",
    "亮点",
    "暗点",
    "气泡",
    "色差",
    "漏光",
    "裂纹",
    "异物",
}

# Columns that must all be filled for a row to count as traceable.
TRACEABILITY_COLUMNS = [
    "equipment_id", "seat_id",
    "lam_equipment_id", "lam_seat_id", "lam_fixture_id",
    "inspection_station",
]
def _non_empty_rate(df, columns):
    """Return the fraction of rows whose listed columns are all filled.

    A cell counts as filled when it is not NA and its string form is not
    empty after stripping surrounding whitespace, so whitespace-only
    cells are treated as empty.

    Args:
        df: DataFrame to inspect.
        columns: Iterable of column names to check. Names that are not
            present in ``df`` are ignored.

    Returns:
        A float in [0.0, 1.0]. 0.0 when ``df`` is empty or none of
        ``columns`` exists in it.
    """
    existing = [column for column in columns if column in df.columns]
    if not existing or df.empty:
        return 0.0
    subset = df[existing]
    # Strip per column so cells such as " " are not mistaken for real values.
    stripped = subset.astype(str).apply(lambda series: series.str.strip())
    filled = subset.notna() & (stripped != "")
    return float(filled.all(axis=1).mean())
def build_data_quality_report(df):
    """Build the data quality report used before a production-grade import.

    Five rates are combined into a 0-100 score with these weights:
    required-field completeness 30, coordinate validity 25, defect-type
    validity 20, traceability coverage 20 and defect_id uniqueness 5.

    A column that a check reads directly (``x_mm``, ``y_mm``,
    ``panel_width_mm``, ``panel_height_mm``, ``defect_type``,
    ``defect_id``) but that ``df`` lacks does not raise: the affected
    rate is reported as 0.0, the check earns no points, and the missing
    column is named in ``issues``.

    Args:
        df: DataFrame of defect records. ``None`` is treated like an
            empty frame.

    Returns:
        dict with keys ``score`` (rounded to one decimal),
        ``required_complete_rate``, ``coordinate_valid_rate``,
        ``enum_valid_rate``, ``traceability_rate``,
        ``duplicate_defect_rate`` and ``issues`` (a non-empty list of
        messages).
    """
    if df is None or df.empty:
        return {
            "score": 0.0,
            "required_complete_rate": 0.0,
            "coordinate_valid_rate": 0.0,
            "enum_valid_rate": 0.0,
            "traceability_rate": 0.0,
            "duplicate_defect_rate": 0.0,
            "issues": ["数据为空"],
        }

    # Imported locally so the module's import block does not need to change.
    import pandas as pd

    coordinate_columns = ["x_mm", "y_mm", "panel_width_mm", "panel_height_mm"]
    missing_coordinates = [
        column for column in coordinate_columns if column not in df.columns
    ]
    has_defect_type = "defect_type" in df.columns
    has_defect_id = "defect_id" in df.columns

    required_complete_rate = _non_empty_rate(df, CORE_REQUIRED_COLUMNS)
    traceability_rate = _non_empty_rate(df, TRACEABILITY_COLUMNS)

    if missing_coordinates:
        coordinate_valid_rate = 0.0
    else:
        # Coerce to numbers so text or unparseable cells count as invalid
        # instead of raising TypeError; NaN compares False in every test below.
        x_mm, y_mm, width_mm, height_mm = (
            pd.to_numeric(df[column], errors="coerce")
            for column in coordinate_columns
        )
        inside_panel = (
            (x_mm >= 0)
            & (x_mm <= width_mm)
            & (y_mm >= 0)
            & (y_mm <= height_mm)
        )
        coordinate_valid_rate = float(inside_panel.mean())

    if has_defect_type:
        enum_valid_rate = float(
            df["defect_type"].isin(VALID_DEFECT_TYPES).mean()
        )
    else:
        enum_valid_rate = 0.0

    if has_defect_id:
        duplicate_defect_rate = float(df["defect_id"].duplicated().mean())
        uniqueness_rate = 1 - duplicate_defect_rate
    else:
        # Uniqueness cannot be verified without the column, so it earns
        # no credit even though no duplicates were observed.
        duplicate_defect_rate = 0.0
        uniqueness_rate = 0.0

    score = (
        required_complete_rate * 30
        + coordinate_valid_rate * 25
        + enum_valid_rate * 20
        + traceability_rate * 20
        + uniqueness_rate * 5
    )

    issues = []
    if required_complete_rate < 1:
        issues.append("必填字段存在空值")
    if missing_coordinates:
        issues.append("缺少坐标字段: " + ", ".join(missing_coordinates))
    elif coordinate_valid_rate < 1:
        issues.append("坐标存在超出面板范围的数据")
    if not has_defect_type:
        issues.append("缺少 defect_type 字段")
    elif enum_valid_rate < 1:
        issues.append("缺陷类型存在未登记枚举")
    if traceability_rate < 1:
        issues.append("工序追溯字段覆盖不完整")
    if not has_defect_id:
        issues.append("缺少 defect_id 字段")
    elif duplicate_defect_rate > 0:
        issues.append("defect_id 存在重复")

    return {
        "score": round(float(score), 1),
        "required_complete_rate": required_complete_rate,
        "coordinate_valid_rate": coordinate_valid_rate,
        "enum_valid_rate": enum_valid_rate,
        "traceability_rate": traceability_rate,
        "duplicate_defect_rate": duplicate_defect_rate,
        "issues": issues or ["数据质量良好"],
    }
|