# data_quality.py (2.5 KB)
"""Data quality assessment."""
import pandas as pd

from defect_analysis.schemas import CORE_REQUIRED_COLUMNS
  3. VALID_DEFECT_TYPES = {"划痕", "亮点", "暗点", "气泡", "色差", "漏光", "裂纹", "异物"}
  4. TRACEABILITY_COLUMNS = [
  5. "equipment_id",
  6. "seat_id",
  7. "lam_equipment_id",
  8. "lam_seat_id",
  9. "lam_fixture_id",
  10. "inspection_station",
  11. ]
  12. def _non_empty_rate(df, columns):
  13. existing = [column for column in columns if column in df.columns]
  14. if not existing or df.empty:
  15. return 0.0
  16. valid = df[existing].notna() & (df[existing].astype(str) != "")
  17. return float(valid.all(axis=1).mean())
  18. def build_data_quality_report(df):
  19. """生成生产级导入前的数据质量报告。"""
  20. if df.empty:
  21. return {
  22. "score": 0.0,
  23. "required_complete_rate": 0.0,
  24. "coordinate_valid_rate": 0.0,
  25. "enum_valid_rate": 0.0,
  26. "traceability_rate": 0.0,
  27. "duplicate_defect_rate": 0.0,
  28. "issues": ["数据为空"],
  29. }
  30. required_complete_rate = _non_empty_rate(df, CORE_REQUIRED_COLUMNS)
  31. coordinate_valid = (
  32. (df["x_mm"] >= 0)
  33. & (df["x_mm"] <= df["panel_width_mm"])
  34. & (df["y_mm"] >= 0)
  35. & (df["y_mm"] <= df["panel_height_mm"])
  36. )
  37. coordinate_valid_rate = float(coordinate_valid.mean())
  38. enum_valid_rate = float(df["defect_type"].isin(VALID_DEFECT_TYPES).mean())
  39. traceability_rate = _non_empty_rate(df, TRACEABILITY_COLUMNS)
  40. duplicate_defect_rate = float(df["defect_id"].duplicated().mean())
  41. score = (
  42. required_complete_rate * 30
  43. + coordinate_valid_rate * 25
  44. + enum_valid_rate * 20
  45. + traceability_rate * 20
  46. + (1 - duplicate_defect_rate) * 5
  47. )
  48. issues = []
  49. if required_complete_rate < 1:
  50. issues.append("必填字段存在空值")
  51. if coordinate_valid_rate < 1:
  52. issues.append("坐标存在超出面板范围的数据")
  53. if enum_valid_rate < 1:
  54. issues.append("缺陷类型存在未登记枚举")
  55. if traceability_rate < 1:
  56. issues.append("工序追溯字段覆盖不完整")
  57. if duplicate_defect_rate > 0:
  58. issues.append("defect_id 存在重复")
  59. return {
  60. "score": round(float(score), 1),
  61. "required_complete_rate": required_complete_rate,
  62. "coordinate_valid_rate": coordinate_valid_rate,
  63. "enum_valid_rate": enum_valid_rate,
  64. "traceability_rate": traceability_rate,
  65. "duplicate_defect_rate": duplicate_defect_rate,
  66. "issues": issues or ["数据质量良好"],
  67. }