generate_data.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291
  1. """
  2. 生成LCD/OLED屏幕检测模拟缺陷数据
  3. 模拟真实场景:边缘/角落缺陷更集中,某些时段缺陷更多,特定设备座号缺陷集中
  4. """
  5. import numpy as np
  6. import pandas as pd
  7. from datetime import datetime, timedelta
  8. import json
  9. import os
# Seed NumPy's global RNG at import time so every run draws the same sequence.
np.random.seed(42)

# --- Configuration ---
NUM_PANELS = 500  # total number of inspected panels
OUTPUT_FILE = "defect_data.csv"

# Panel dimensions (mm)
PANEL_WIDTH = 155.0
PANEL_HEIGHT = 340.0

# Front-lamination process equipment.
# Three simulated lamination machines with 20 seats each: LAM-A01 and LAM-A02
# use a 4x5 (rows x cols) grid, LAM-B01 uses a 5x4 grid.
LAMINATION_EQUIPMENT = {
    "LAM-A01": {"rows": 4, "cols": 5, "total_seats": 20},
    "LAM-A02": {"rows": 4, "cols": 5, "total_seats": 20},
    "LAM-B01": {"rows": 5, "cols": 4, "total_seats": 20},
}
  24. # 座号格式: 行号-列号,如 R1C1, R1C2, ...
  25. def get_seat_names(n_rows, n_cols):
  26. seats = []
  27. for r in range(1, n_rows + 1):
  28. for c in range(1, n_cols + 1):
  29. seats.append(f"R{r}C{c}")
  30. return seats
# Simulated per-seat defect tendencies (some seats produce more defects of one
# type because of an equipment problem):
#   LAM-A01 seat R2C3: aging suction nozzle  -> bubble ("气泡") defects cluster
#   LAM-A01 seat R4C1: uneven heating        -> light-leak ("漏光") defects cluster
#   LAM-A02 seat R1C5: uneven pressure       -> color-difference ("色差") defects cluster
#   LAM-B01 seat R3C2: particle contamination -> foreign-matter ("异物") defects cluster
# Keys are "<equipment_id>_<seat_id>"; "weight_boost" multiplies the base weight
# of "defect_type" before the distribution is re-normalized.
SEAT_DEFECT_BIAS = {
    "LAM-A01_R2C3": {"defect_type": "气泡", "weight_boost": 3.0},
    "LAM-A01_R4C1": {"defect_type": "漏光", "weight_boost": 2.5},
    "LAM-A02_R1C5": {"defect_type": "色差", "weight_boost": 2.5},
    "LAM-B01_R3C2": {"defect_type": "异物", "weight_boost": 3.0},
}

# Defect types and their base sampling weights (a Pareto-like mix: a few types
# account for most defects).  The weights sum to 1.0 and are passed directly
# as probabilities to np.random.choice.
DEFECT_TYPES = {
    "划痕": 0.30,  # scratch
    "亮点": 0.20,  # bright dot
    "暗点": 0.15,  # dark dot
    "气泡": 0.12,  # bubble
    "色差": 0.08,  # color difference
    "漏光": 0.07,  # light leak
    "裂纹": 0.04,  # crack
    "异物": 0.04,  # foreign matter
}

# Production time window: 30 simulated days of data.
START_DATE = datetime(2026, 4, 1, 8, 0, 0)
END_DATE = datetime(2026, 4, 30, 20, 0, 0)

# Inspection (AOI) stations for the front-lamination process.
INSPECTION_STATIONS = ["AOI-前贴附#1", "AOI-前贴附#2", "AOI-后段全检"]
  58. def generate_panel_positions():
  59. """生成面板位置分布,模拟空间集中性"""
  60. positions = []
  61. # 热点1:左边缘区域(贴合工艺问题)
  62. n1 = np.random.randint(200, 350)
  63. x1 = np.random.normal(8, 5, n1)
  64. y1 = np.random.uniform(20, PANEL_HEIGHT - 20, n1)
  65. # 热点2:右下角(受力集中区)
  66. n2 = np.random.randint(150, 280)
  67. x2 = np.random.normal(PANEL_WIDTH - 15, 8, n2)
  68. y2 = np.random.normal(PANEL_HEIGHT - 20, 15, n2)
  69. # 热点3:中心偏上(FPC绑定区域)
  70. n3 = np.random.randint(100, 200)
  71. x3 = np.random.normal(PANEL_WIDTH / 2, 20, n3)
  72. y3 = np.random.normal(PANEL_HEIGHT * 0.75, 12, n3)
  73. # 热点4:上边缘
  74. n4 = np.random.randint(80, 150)
  75. x4 = np.random.uniform(30, PANEL_WIDTH - 30, n4)
  76. y4 = np.random.normal(10, 4, n4)
  77. # 均匀分布的随机缺陷(背景噪声)
  78. n5 = np.random.randint(200, 400)
  79. x5 = np.random.uniform(5, PANEL_WIDTH - 5, n5)
  80. y5 = np.random.uniform(5, PANEL_HEIGHT - 5, n5)
  81. all_x = np.concatenate([x1, x2, x3, x4, x5])
  82. all_y = np.concatenate([y1, y2, y3, y4, y5])
  83. mask = (all_x >= 0) & (all_x <= PANEL_WIDTH) & (all_y >= 0) & (all_y <= PANEL_HEIGHT)
  84. positions = list(zip(np.clip(all_x[mask], 0, PANEL_WIDTH),
  85. np.clip(all_y[mask], 0, PANEL_HEIGHT)))
  86. return positions
  87. def generate_time_distribution(n_defects):
  88. """生成时间分布,模拟特定时段缺陷集中"""
  89. timestamps = []
  90. total_seconds = (END_DATE - START_DATE).total_seconds()
  91. for _ in range(n_defects):
  92. random_seconds = np.random.uniform(0, total_seconds)
  93. ts = START_DATE + timedelta(seconds=random_seconds)
  94. # 夜班(17:00-8:00)缺陷权重更高
  95. hour = ts.hour
  96. if hour >= 17 or hour < 8:
  97. if np.random.random() > 0.6:
  98. timestamps.append(ts)
  99. else:
  100. day_seconds = np.random.uniform(0, 9 * 3600)
  101. day_ts = ts.replace(hour=8) + timedelta(seconds=day_seconds)
  102. timestamps.append(day_ts)
  103. else:
  104. timestamps.append(ts)
  105. return timestamps
  106. def assign_equipment_and_seat(n_defects, timestamps):
  107. """为每个缺陷分配设备和座号"""
  108. equipment_list = list(LAMINATION_EQUIPMENT.keys())
  109. equipment_ids = []
  110. seat_ids = []
  111. for ts in timestamps:
  112. # 根据时间段分配设备(模拟不同班次使用不同设备)
  113. hour = ts.hour
  114. if hour < 12:
  115. eq_idx = 0 # 白班主要用 LAM-A01
  116. elif hour < 17:
  117. eq_idx = np.random.choice([0, 1]) # 下午两台都用
  118. else:
  119. eq_idx = np.random.choice([1, 2]) # 夜班用 LAM-A02 和 LAM-B01
  120. eq_id = equipment_list[eq_idx]
  121. eq_info = LAMINATION_EQUIPMENT[eq_id]
  122. seat_names = get_seat_names(eq_info["rows"], eq_info["cols"])
  123. seat = np.random.choice(seat_names)
  124. equipment_ids.append(eq_id)
  125. seat_ids.append(seat)
  126. return equipment_ids, seat_ids
  127. def generate_defect_type_with_seat_bias(n_defects, equipment_ids, seat_ids):
  128. """生成缺陷类型,考虑座号偏差"""
  129. types = list(DEFECT_TYPES.keys())
  130. weights = np.array(list(DEFECT_TYPES.values()))
  131. defect_type_list = []
  132. for i in range(n_defects):
  133. eq_id = equipment_ids[i]
  134. seat_id = seat_ids[i]
  135. key = f"{eq_id}_{seat_id}"
  136. if key in SEAT_DEFECT_BIAS:
  137. bias = SEAT_DEFECT_BIAS[key]
  138. # 创建新的权重分布,增加特定缺陷类型的概率
  139. biased_weights = weights.copy()
  140. type_idx = types.index(bias["defect_type"])
  141. biased_weights[type_idx] *= bias["weight_boost"]
  142. biased_weights /= biased_weights.sum()
  143. defect_type = np.random.choice(types, p=biased_weights)
  144. else:
  145. defect_type = np.random.choice(types, p=weights)
  146. defect_type_list.append(defect_type)
  147. return defect_type_list
  148. def generate_severity(defect_type):
  149. """根据缺陷类型生成严重程度"""
  150. severity_map = {
  151. "裂纹": np.random.choice(["严重", "中等"], p=[0.7, 0.3]),
  152. "漏光": np.random.choice(["严重", "中等", "轻微"], p=[0.4, 0.4, 0.2]),
  153. "划痕": np.random.choice(["严重", "中等", "轻微"], p=[0.2, 0.4, 0.4]),
  154. }
  155. return severity_map.get(defect_type,
  156. np.random.choice(["轻微", "中等", "严重"], p=[0.5, 0.35, 0.15]))
  157. def generate_data():
  158. """生成完整的缺陷数据集"""
  159. print("生成模拟缺陷数据...")
  160. # 生成空间位置
  161. positions = generate_panel_positions()
  162. n_defects = len(positions)
  163. print(f" 生成 {n_defects} 个缺陷记录")
  164. # 生成时间
  165. timestamps = generate_time_distribution(n_defects)
  166. # 分配设备和座号
  167. equipment_ids, seat_ids = assign_equipment_and_seat(n_defects, timestamps)
  168. # 生成缺陷类型(考虑座号偏差)
  169. defect_type_list = generate_defect_type_with_seat_bias(n_defects, equipment_ids, seat_ids)
  170. # 生成面板ID (模拟500块面板)
  171. panel_ids = [f"PANEL-{np.random.randint(1, NUM_PANELS+1):04d}" for _ in range(n_defects)]
  172. # 生成批次号
  173. batch_ids = [f"BATCH-{ts.strftime('%Y%m%d')}" for ts in timestamps]
  174. # 生成严重程度
  175. severities = [generate_severity(dt) for dt in defect_type_list]
  176. # 生成检测工位
  177. inspection_stations = [np.random.choice(INSPECTION_STATIONS, p=[0.4, 0.4, 0.2]) for _ in range(n_defects)]
  178. # 创建 DataFrame
  179. df = pd.DataFrame({
  180. "defect_id": [f"D{i+1:05d}" for i in range(n_defects)],
  181. "panel_id": panel_ids,
  182. "batch_id": batch_ids,
  183. "equipment_id": equipment_ids,
  184. "seat_id": seat_ids,
  185. "inspection_station": inspection_stations,
  186. "timestamp": timestamps,
  187. "defect_type": defect_type_list,
  188. "severity": severities,
  189. "x_mm": [round(p[0], 2) for p in positions],
  190. "y_mm": [round(p[1], 2) for p in positions],
  191. "panel_width_mm": PANEL_WIDTH,
  192. "panel_height_mm": PANEL_HEIGHT,
  193. "hour": [ts.hour for ts in timestamps],
  194. "shift": ["夜班" if (ts.hour >= 17 or ts.hour < 8) else "白班" for ts in timestamps],
  195. "day": [ts.strftime("%Y-%m-%d") for ts in timestamps],
  196. })
  197. # 保存
  198. df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8-sig")
  199. print(f"数据已保存到 {OUTPUT_FILE}")
  200. # 保存统计摘要
  201. types = list(DEFECT_TYPES.keys())
  202. summary = {
  203. "total_defects": n_defects,
  204. "total_panels": NUM_PANELS,
  205. "defect_types": {t: int((df["defect_type"] == t).sum()) for t in types},
  206. "severity_distribution": {s: int((df["severity"] == s).sum()) for s in ["轻微", "中等", "严重"]},
  207. "shift_distribution": {s: int((df["shift"] == s).sum()) for s in ["白班", "夜班"]},
  208. "equipment_distribution": {e: int((df["equipment_id"] == e).sum()) for e in LAMINATION_EQUIPMENT.keys()},
  209. "date_range": {
  210. "start": START_DATE.strftime("%Y-%m-%d"),
  211. "end": END_DATE.strftime("%Y-%m-%d"),
  212. },
  213. "lamination_config": {
  214. "equipment": list(LAMINATION_EQUIPMENT.keys()),
  215. "seat_bias": {k: v["defect_type"] for k, v in SEAT_DEFECT_BIAS.items()},
  216. },
  217. }
  218. with open("data_summary.json", "w", encoding="utf-8") as f:
  219. json.dump(summary, f, ensure_ascii=False, indent=2)
  220. print(f"统计摘要已保存到 data_summary.json")
  221. return df
  222. if __name__ == "__main__":
  223. df = generate_data()
  224. print(f"\n数据概览:")
  225. print(f" 总记录数: {len(df)}")
  226. print(f" 缺陷类型数: {df['defect_type'].nunique()}")
  227. print(f" 面板数量: {df['panel_id'].nunique()}")
  228. print(f" 批次数量: {df['batch_id'].nunique()}")
  229. print(f" 设备数量: {df['equipment_id'].nunique()}")
  230. print(f" 座号数量: {df['seat_id'].nunique()}")
  231. print(f"\n缺陷类型分布:")
  232. print(df["defect_type"].value_counts().to_string())
  233. print(f"\n设备分布:")
  234. print(df["equipment_id"].value_counts().to_string())
  235. print(f"\n班次分布:")
  236. print(df["shift"].value_counts().to_string())