# data_quality.py
"""Data quality assessment."""
  2. from defect_analysis.schemas import CORE_REQUIRED_COLUMNS
  3. VALID_DEFECT_TYPES = {"划痕", "亮点", "暗点", "气泡", "色差", "漏光", "裂纹", "异物"}
  4. TRACEABILITY_COLUMNS = [
  5. "equipment_id",
  6. "seat_id",
  7. "lam_equipment_id",
  8. "lam_seat_id",
  9. "lam_fixture_id",
  10. "inspection_station",
  11. ]
  12. def _non_empty_rate(df, columns):
  13. existing = [column for column in columns if column in df.columns]
  14. if not existing or df.empty:
  15. return 0.0
  16. valid = df[existing].notna() & (df[existing].astype(str) != "")
  17. return float(valid.all(axis=1).mean())
  18. def build_data_quality_report(df):
  19. """生成生产级导入前的数据质量报告。"""
  20. if df.empty:
  21. return {
  22. "score": 0.0,
  23. "required_complete_rate": 0.0,
  24. "coordinate_valid_rate": 0.0,
  25. "enum_valid_rate": 0.0,
  26. "traceability_rate": 0.0,
  27. "duplicate_defect_rate": 0.0,
  28. "issues": ["数据为空"],
  29. }
  30. missing_columns = [column for column in CORE_REQUIRED_COLUMNS if column not in df.columns]
  31. required_complete_rate = _non_empty_rate(df, CORE_REQUIRED_COLUMNS)
  32. coordinate_columns = ["x_mm", "y_mm", "panel_width_mm", "panel_height_mm"]
  33. if all(column in df.columns for column in coordinate_columns):
  34. coordinate_valid = (
  35. (df["x_mm"] >= 0)
  36. & (df["x_mm"] <= df["panel_width_mm"])
  37. & (df["y_mm"] >= 0)
  38. & (df["y_mm"] <= df["panel_height_mm"])
  39. )
  40. coordinate_valid_rate = float(coordinate_valid.mean())
  41. else:
  42. coordinate_valid_rate = 0.0
  43. enum_valid_rate = float(df["defect_type"].isin(VALID_DEFECT_TYPES).mean()) if "defect_type" in df.columns else 0.0
  44. traceability_rate = _non_empty_rate(df, TRACEABILITY_COLUMNS)
  45. duplicate_defect_rate = float(df["defect_id"].duplicated().mean())
  46. score = (
  47. required_complete_rate * 30
  48. + coordinate_valid_rate * 25
  49. + enum_valid_rate * 20
  50. + traceability_rate * 20
  51. + (1 - duplicate_defect_rate) * 5
  52. )
  53. issues = []
  54. if required_complete_rate < 1:
  55. issues.append("必填字段存在空值")
  56. if missing_columns:
  57. issues.append("缺少必填字段: " + ", ".join(missing_columns))
  58. if coordinate_valid_rate < 1:
  59. issues.append("坐标存在超出面板范围的数据")
  60. if enum_valid_rate < 1:
  61. issues.append("缺陷类型存在未登记枚举")
  62. if traceability_rate < 1:
  63. issues.append("工序追溯字段覆盖不完整")
  64. if duplicate_defect_rate > 0:
  65. issues.append("defect_id 存在重复")
  66. return {
  67. "score": round(float(score), 1),
  68. "required_complete_rate": required_complete_rate,
  69. "coordinate_valid_rate": coordinate_valid_rate,
  70. "enum_valid_rate": enum_valid_rate,
  71. "traceability_rate": traceability_rate,
  72. "duplicate_defect_rate": duplicate_defect_rate,
  73. "issues": issues or ["数据质量良好"],
  74. }