generate_data.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374
  1. """
  2. 生成LCD/OLED屏幕检测模拟缺陷数据
  3. 模拟真实场景:边缘/角落缺陷更集中,某些时段缺陷更多,特定设备座号缺陷集中
  4. """
  5. import numpy as np
  6. import pandas as pd
  7. from datetime import datetime, timedelta
  8. import json
  9. import os
  10. np.random.seed(42)
  11. # --- 配置 ---
  12. NUM_PANELS = 500 # 检测面板总数
  13. OUTPUT_FILE = "defect_data.csv"
  14. # 面板尺寸 (mm)
  15. PANEL_WIDTH = 155.0
  16. PANEL_HEIGHT = 340.0
  17. # 前贴附制程设备配置
  18. # 模拟3台前贴附设备,每台有4x5=20个座号
  19. LAMINATION_EQUIPMENT = {
  20. "LAM-A01": {"rows": 4, "cols": 5, "total_seats": 20},
  21. "LAM-A02": {"rows": 4, "cols": 5, "total_seats": 20},
  22. "LAM-B01": {"rows": 5, "cols": 4, "total_seats": 20},
  23. }
  24. # 座号格式: 行号-列号,如 R1C1, R1C2, ...
  25. def get_seat_names(n_rows, n_cols):
  26. seats = []
  27. for r in range(1, n_rows + 1):
  28. for c in range(1, n_cols + 1):
  29. seats.append(f"R{r}C{c}")
  30. return seats
  31. # 模拟座号缺陷倾向(某些座号因设备问题缺陷更多)
  32. # LAM-A01 的 R2C3 座号吸嘴老化 → 气泡缺陷集中
  33. # LAM-A01 的 R4C1 座号加热不均 → 漏光缺陷集中
  34. # LAM-A02 的 R1C5 座号压力不均 → 色差缺陷集中
  35. # LAM-B01 的 R3C2 座号异物污染 → 异物缺陷集中
  36. SEAT_DEFECT_BIAS = {
  37. "LAM-A01_R2C3": {"defect_type": "气泡", "weight_boost": 3.0},
  38. "LAM-A01_R4C1": {"defect_type": "漏光", "weight_boost": 2.5},
  39. "LAM-A02_R1C5": {"defect_type": "色差", "weight_boost": 2.5},
  40. "LAM-B01_R3C2": {"defect_type": "异物", "weight_boost": 3.0},
  41. }
  42. # 缺陷类型及其权重(模拟帕累托分布:少数类型占多数)
  43. DEFECT_TYPES = {
  44. "划痕": 0.30,
  45. "亮点": 0.20,
  46. "暗点": 0.15,
  47. "气泡": 0.12,
  48. "色差": 0.08,
  49. "漏光": 0.07,
  50. "裂纹": 0.04,
  51. "异物": 0.04,
  52. }
  53. # 生产时间范围:模拟30天的数据
  54. START_DATE = datetime(2026, 4, 1, 8, 0, 0)
  55. END_DATE = datetime(2026, 4, 30, 20, 0, 0)
  56. # 前贴附制程检测工位 (AOI)
  57. INSPECTION_STATIONS = ["AOI-前贴附#1", "AOI-前贴附#2", "AOI-后段全检"]
  58. def generate_panel_positions():
  59. """生成面板位置分布,模拟空间集中性"""
  60. positions = []
  61. # 热点1:左边缘区域(贴合工艺问题)
  62. n1 = np.random.randint(200, 350)
  63. x1 = np.random.normal(8, 5, n1)
  64. y1 = np.random.uniform(20, PANEL_HEIGHT - 20, n1)
  65. # 热点2:右下角(受力集中区)
  66. n2 = np.random.randint(150, 280)
  67. x2 = np.random.normal(PANEL_WIDTH - 15, 8, n2)
  68. y2 = np.random.normal(PANEL_HEIGHT - 20, 15, n2)
  69. # 热点3:中心偏上(FPC绑定区域)
  70. n3 = np.random.randint(100, 200)
  71. x3 = np.random.normal(PANEL_WIDTH / 2, 20, n3)
  72. y3 = np.random.normal(PANEL_HEIGHT * 0.75, 12, n3)
  73. # 热点4:上边缘
  74. n4 = np.random.randint(80, 150)
  75. x4 = np.random.uniform(30, PANEL_WIDTH - 30, n4)
  76. y4 = np.random.normal(10, 4, n4)
  77. # 均匀分布的随机缺陷(背景噪声)
  78. n5 = np.random.randint(200, 400)
  79. x5 = np.random.uniform(5, PANEL_WIDTH - 5, n5)
  80. y5 = np.random.uniform(5, PANEL_HEIGHT - 5, n5)
  81. all_x = np.concatenate([x1, x2, x3, x4, x5])
  82. all_y = np.concatenate([y1, y2, y3, y4, y5])
  83. mask = (all_x >= 0) & (all_x <= PANEL_WIDTH) & (all_y >= 0) & (all_y <= PANEL_HEIGHT)
  84. positions = list(zip(np.clip(all_x[mask], 0, PANEL_WIDTH),
  85. np.clip(all_y[mask], 0, PANEL_HEIGHT)))
  86. return positions
  87. def generate_time_distribution(n_defects):
  88. """生成时间分布,模拟特定时段缺陷集中"""
  89. timestamps = []
  90. total_seconds = (END_DATE - START_DATE).total_seconds()
  91. for _ in range(n_defects):
  92. random_seconds = np.random.uniform(0, total_seconds)
  93. ts = START_DATE + timedelta(seconds=random_seconds)
  94. # 夜班(17:00-8:00)缺陷权重更高
  95. hour = ts.hour
  96. if hour >= 17 or hour < 8:
  97. if np.random.random() > 0.6:
  98. timestamps.append(ts)
  99. else:
  100. day_seconds = np.random.uniform(0, 9 * 3600)
  101. day_ts = ts.replace(hour=8) + timedelta(seconds=day_seconds)
  102. timestamps.append(day_ts)
  103. else:
  104. timestamps.append(ts)
  105. return timestamps
  106. def assign_equipment_and_seat(n_defects, timestamps):
  107. """为每个缺陷分配设备和座号"""
  108. equipment_list = list(LAMINATION_EQUIPMENT.keys())
  109. equipment_ids = []
  110. seat_ids = []
  111. for ts in timestamps:
  112. # 根据时间段分配设备(模拟不同班次使用不同设备)
  113. hour = ts.hour
  114. if hour < 12:
  115. eq_idx = 0 # 白班主要用 LAM-A01
  116. elif hour < 17:
  117. eq_idx = np.random.choice([0, 1]) # 下午两台都用
  118. else:
  119. eq_idx = np.random.choice([1, 2]) # 夜班用 LAM-A02 和 LAM-B01
  120. eq_id = equipment_list[eq_idx]
  121. eq_info = LAMINATION_EQUIPMENT[eq_id]
  122. seat_names = get_seat_names(eq_info["rows"], eq_info["cols"])
  123. seat = np.random.choice(seat_names)
  124. equipment_ids.append(eq_id)
  125. seat_ids.append(seat)
  126. return equipment_ids, seat_ids
  127. def generate_defect_type_with_seat_bias(n_defects, equipment_ids, seat_ids):
  128. """生成缺陷类型,考虑座号偏差"""
  129. types = list(DEFECT_TYPES.keys())
  130. weights = np.array(list(DEFECT_TYPES.values()))
  131. defect_type_list = []
  132. for i in range(n_defects):
  133. eq_id = equipment_ids[i]
  134. seat_id = seat_ids[i]
  135. key = f"{eq_id}_{seat_id}"
  136. if key in SEAT_DEFECT_BIAS:
  137. bias = SEAT_DEFECT_BIAS[key]
  138. # 创建新的权重分布,增加特定缺陷类型的概率
  139. biased_weights = weights.copy()
  140. type_idx = types.index(bias["defect_type"])
  141. biased_weights[type_idx] *= bias["weight_boost"]
  142. biased_weights /= biased_weights.sum()
  143. defect_type = np.random.choice(types, p=biased_weights)
  144. else:
  145. defect_type = np.random.choice(types, p=weights)
  146. defect_type_list.append(defect_type)
  147. return defect_type_list
  148. def generate_severity(defect_type):
  149. """根据缺陷类型生成严重程度"""
  150. severity_map = {
  151. "裂纹": np.random.choice(["严重", "中等"], p=[0.7, 0.3]),
  152. "漏光": np.random.choice(["严重", "中等", "轻微"], p=[0.4, 0.4, 0.2]),
  153. "划痕": np.random.choice(["严重", "中等", "轻微"], p=[0.2, 0.4, 0.4]),
  154. }
  155. return severity_map.get(defect_type,
  156. np.random.choice(["轻微", "中等", "严重"], p=[0.5, 0.35, 0.15]))
  157. def generate_geometry(defect_type):
  158. """按缺陷类型生成点/线/面几何属性。"""
  159. if defect_type in ["划痕", "裂纹"]:
  160. length = float(np.random.uniform(4, 35))
  161. width = float(np.random.uniform(0.05, 0.4))
  162. return {
  163. "defect_geometry_type": "line",
  164. "width_mm": round(width, 2),
  165. "height_mm": 0.0,
  166. "length_mm": round(length, 2),
  167. "angle_deg": round(float(np.random.uniform(0, 180)), 1),
  168. "area_mm2": round(length * width, 2),
  169. }
  170. if defect_type in ["漏光", "色差", "气泡"]:
  171. width = float(np.random.uniform(1.5, 12))
  172. height = float(np.random.uniform(1.5, 16))
  173. return {
  174. "defect_geometry_type": "region",
  175. "width_mm": round(width, 2),
  176. "height_mm": round(height, 2),
  177. "length_mm": 0.0,
  178. "angle_deg": 0.0,
  179. "area_mm2": round(width * height, 2),
  180. }
  181. size = float(np.random.uniform(0.05, 0.8))
  182. return {
  183. "defect_geometry_type": "point",
  184. "width_mm": round(size, 2),
  185. "height_mm": round(size, 2),
  186. "length_mm": 0.0,
  187. "angle_deg": 0.0,
  188. "area_mm2": round(size * size, 3),
  189. }
  190. def generate_data():
  191. """生成完整的缺陷数据集"""
  192. print("生成模拟缺陷数据...")
  193. # 生成空间位置
  194. positions = generate_panel_positions()
  195. n_defects = len(positions)
  196. print(f" 生成 {n_defects} 个缺陷记录")
  197. # 生成时间
  198. timestamps = generate_time_distribution(n_defects)
  199. # 分配设备和座号
  200. equipment_ids, seat_ids = assign_equipment_and_seat(n_defects, timestamps)
  201. # 生成缺陷类型(考虑座号偏差)
  202. defect_type_list = generate_defect_type_with_seat_bias(n_defects, equipment_ids, seat_ids)
  203. # 生成面板ID (模拟500块面板)
  204. panel_ids = [f"PANEL-{np.random.randint(1, NUM_PANELS+1):04d}" for _ in range(n_defects)]
  205. # 生成批次号
  206. batch_ids = [f"BATCH-{ts.strftime('%Y%m%d')}" for ts in timestamps]
  207. # 生成严重程度
  208. severities = [generate_severity(dt) for dt in defect_type_list]
  209. # 生成检测工位
  210. inspection_stations = [np.random.choice(INSPECTION_STATIONS, p=[0.4, 0.4, 0.2]) for _ in range(n_defects)]
  211. geometries = [generate_geometry(dt) for dt in defect_type_list]
  212. x_values = [round(p[0], 2) for p in positions]
  213. y_values = [round(p[1], 2) for p in positions]
  214. clean_equipment_ids = [f"CLN-{np.random.choice(['A01', 'A02'])}" for _ in range(n_defects)]
  215. clean_slot_ids = [f"SLOT-{np.random.randint(1, 13):02d}" for _ in range(n_defects)]
  216. bond_equipment_ids = [f"BON-{np.random.choice(['A01', 'A02', 'B01'])}" for _ in range(n_defects)]
  217. bond_head_ids = [f"HEAD-{np.random.randint(1, 7):02d}" for _ in range(n_defects)]
  218. recipe_ids = [f"RCP-LAM-{eq[-3:]}" for eq in equipment_ids]
  219. lam_fixture_ids = [f"FIX-{eq[-3:]}-{np.random.randint(1, 5):02d}" for eq in equipment_ids]
  220. lam_jig_ids = [f"JIG-{seat}" for seat in seat_ids]
  221. lam_nozzle_ids = [f"NZ-{np.random.randint(1, 21):02d}" for _ in range(n_defects)]
  222. material_lot_glass = [f"GLS-{ts.strftime('%Y%m%d')}-{np.random.randint(1, 4)}" for ts in timestamps]
  223. material_lot_oca = [f"OCA-{ts.strftime('%Y%W')}-{np.random.randint(1, 6)}" for ts in timestamps]
  224. material_lot_polarizer = [f"POL-{ts.strftime('%Y%W')}-{np.random.randint(1, 5)}" for ts in timestamps]
  225. material_lot_cover = [f"CVR-{ts.strftime('%Y%W')}-{np.random.randint(1, 4)}" for ts in timestamps]
  226. # 创建 DataFrame
  227. df = pd.DataFrame({
  228. "defect_id": [f"D{i+1:05d}" for i in range(n_defects)],
  229. "panel_id": panel_ids,
  230. "batch_id": batch_ids,
  231. "equipment_id": equipment_ids,
  232. "seat_id": seat_ids,
  233. "inspection_station": inspection_stations,
  234. "timestamp": timestamps,
  235. "defect_type": defect_type_list,
  236. "severity": severities,
  237. "x_mm": x_values,
  238. "y_mm": y_values,
  239. "panel_width_mm": PANEL_WIDTH,
  240. "panel_height_mm": PANEL_HEIGHT,
  241. "hour": [ts.hour for ts in timestamps],
  242. "shift": ["夜班" if (ts.hour >= 17 or ts.hour < 8) else "白班" for ts in timestamps],
  243. "day": [ts.strftime("%Y-%m-%d") for ts in timestamps],
  244. "defect_geometry_type": [g["defect_geometry_type"] for g in geometries],
  245. "width_mm": [g["width_mm"] for g in geometries],
  246. "height_mm": [g["height_mm"] for g in geometries],
  247. "length_mm": [g["length_mm"] for g in geometries],
  248. "angle_deg": [g["angle_deg"] for g in geometries],
  249. "area_mm2": [g["area_mm2"] for g in geometries],
  250. "bbox_x_min_mm": [max(0, x - g["width_mm"] / 2) for x, g in zip(x_values, geometries)],
  251. "bbox_y_min_mm": [max(0, y - g["height_mm"] / 2) for y, g in zip(y_values, geometries)],
  252. "bbox_x_max_mm": [min(PANEL_WIDTH, x + g["width_mm"] / 2) for x, g in zip(x_values, geometries)],
  253. "bbox_y_max_mm": [min(PANEL_HEIGHT, y + g["height_mm"] / 2) for y, g in zip(y_values, geometries)],
  254. "contour_json": "",
  255. "process_step": "前贴附",
  256. "recipe_id": recipe_ids,
  257. "operator_id": [f"OP-{np.random.randint(1, 9):02d}" for _ in range(n_defects)],
  258. "clean_equipment_id": clean_equipment_ids,
  259. "clean_slot_id": clean_slot_ids,
  260. "clean_recipe_id": [f"RCP-CLN-{eq[-3:]}" for eq in clean_equipment_ids],
  261. "lam_equipment_id": equipment_ids,
  262. "lam_seat_id": seat_ids,
  263. "lam_fixture_id": lam_fixture_ids,
  264. "lam_jig_id": lam_jig_ids,
  265. "lam_nozzle_id": lam_nozzle_ids,
  266. "bond_equipment_id": bond_equipment_ids,
  267. "bond_head_id": bond_head_ids,
  268. "bond_recipe_id": [f"RCP-BON-{eq[-3:]}" for eq in bond_equipment_ids],
  269. "aoi_equipment_id": inspection_stations,
  270. "aoi_station_id": inspection_stations,
  271. "material_lot_glass": material_lot_glass,
  272. "material_lot_oca": material_lot_oca,
  273. "material_lot_polarizer": material_lot_polarizer,
  274. "material_lot_cover": material_lot_cover,
  275. })
  276. # 保存
  277. df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8-sig")
  278. print(f"数据已保存到 {OUTPUT_FILE}")
  279. # 保存统计摘要
  280. types = list(DEFECT_TYPES.keys())
  281. summary = {
  282. "total_defects": n_defects,
  283. "total_panels": NUM_PANELS,
  284. "defect_types": {t: int((df["defect_type"] == t).sum()) for t in types},
  285. "severity_distribution": {s: int((df["severity"] == s).sum()) for s in ["轻微", "中等", "严重"]},
  286. "shift_distribution": {s: int((df["shift"] == s).sum()) for s in ["白班", "夜班"]},
  287. "equipment_distribution": {e: int((df["equipment_id"] == e).sum()) for e in LAMINATION_EQUIPMENT.keys()},
  288. "date_range": {
  289. "start": START_DATE.strftime("%Y-%m-%d"),
  290. "end": END_DATE.strftime("%Y-%m-%d"),
  291. },
  292. "lamination_config": {
  293. "equipment": list(LAMINATION_EQUIPMENT.keys()),
  294. "seat_bias": {k: v["defect_type"] for k, v in SEAT_DEFECT_BIAS.items()},
  295. },
  296. }
  297. with open("data_summary.json", "w", encoding="utf-8") as f:
  298. json.dump(summary, f, ensure_ascii=False, indent=2)
  299. print(f"统计摘要已保存到 data_summary.json")
  300. return df
  301. if __name__ == "__main__":
  302. df = generate_data()
  303. print(f"\n数据概览:")
  304. print(f" 总记录数: {len(df)}")
  305. print(f" 缺陷类型数: {df['defect_type'].nunique()}")
  306. print(f" 面板数量: {df['panel_id'].nunique()}")
  307. print(f" 批次数量: {df['batch_id'].nunique()}")
  308. print(f" 设备数量: {df['equipment_id'].nunique()}")
  309. print(f" 座号数量: {df['seat_id'].nunique()}")
  310. print(f"\n缺陷类型分布:")
  311. print(df["defect_type"].value_counts().to_string())
  312. print(f"\n设备分布:")
  313. print(df["equipment_id"].value_counts().to_string())
  314. print(f"\n班次分布:")
  315. print(df["shift"].value_counts().to_string())