chengjie 4 dni temu
rodzic
commit
7013174e7a

Plik diff jest za duży
+ 28 - 5
秒过分数线数据导入/README.md


Plik diff jest za duży
+ 2361 - 0
秒过分数线数据导入/mps_score_school_quota_2026_qingpu_cross_page_fix_backup.json


+ 39 - 14
秒过分数线数据导入/research_mps_score_school_quota_2026.py

@@ -213,41 +213,66 @@ def parse_qingpu_text(path, high_by_code, junior_by_code):
213 213
     rows = []
214 214
     problems = []
215 215
     current_high = None
216
+    current_high_code = None
216 217
 
217 218
     with pdfplumber.open(path) as pdf:
218 219
         for page in pdf.pages:
219
-            high_markers = []
220
+            page_high_codes = []
220 221
             for word in page.extract_words(x_tolerance=3, y_tolerance=3):
221 222
                 if word.get("x0", 999) > 230:
222 223
                     continue
223 224
                 code = clean_code(word.get("text"))
224 225
                 if code in high_by_code:
225
-                    high_markers.append((float(word["top"]), code, high_by_code[code]))
226
-            high_markers.sort(key=lambda item: item[0])
226
+                    if code not in page_high_codes:
227
+                        page_high_codes.append(code)
227 228
 
228 229
             for table in page.find_tables():
229 230
                 extracted = table.extract()
230
-                table_top = float(table.bbox[1])
231
-                preceding = [item for item in high_markers if item[0] < table_top]
232
-                if preceding:
233
-                    current_high = preceding[-1][2]
231
+                table_top, table_bottom = float(table.bbox[1]), float(table.bbox[3])
232
+                is_continuation_table = extracted and len(extracted[0]) == 3
233
+                boundary_switches = {}
234
+                full_width_boundaries = []
235
+                for edge in page.edges:
236
+                    if edge.get("orientation") != "h":
237
+                        continue
238
+                    top = float(edge["top"])
239
+                    if (
240
+                        float(edge["x0"]) < 100
241
+                        and float(edge["x1"]) > 500
242
+                        and table_top + 3 < top < table_bottom - 3
243
+                    ):
244
+                        if not full_width_boundaries or abs(top - full_width_boundaries[-1]) > 2:
245
+                            full_width_boundaries.append(top)
246
+
247
+                if is_continuation_table:
248
+                    if page_high_codes:
249
+                        current_high_code = page_high_codes[0]
250
+                        current_high = high_by_code[current_high_code]
251
+                    next_high_codes = page_high_codes[1:]
252
+                    switch_pairs = zip(full_width_boundaries, next_high_codes)
253
+                else:
254
+                    switch_pairs = zip(full_width_boundaries, page_high_codes)
255
+
256
+                for boundary, code in switch_pairs:
257
+                    switch_index = min(
258
+                        range(len(table.rows)),
259
+                        key=lambda index: abs(float(table.rows[index].bbox[1]) - boundary),
260
+                    )
261
+                    boundary_switches[switch_index] = code
234 262
 
235 263
                 for index, raw in enumerate(extracted):
236 264
                     if not raw:
237 265
                         continue
238
-                    row_bbox = table.rows[index].bbox
239
-                    row_top, row_bottom = float(row_bbox[1]), float(row_bbox[3])
240
-                    markers_in_row = [
241
-                        item for item in high_markers if row_top - 3 <= item[0] <= row_bottom + 3
242
-                    ]
243
-                    if markers_in_row:
244
-                        current_high = markers_in_row[-1][2]
266
+                    if index in boundary_switches:
267
+                        current_high_code = boundary_switches[index]
268
+                        current_high = high_by_code[current_high_code]
245 269
 
246 270
                     if len(raw) >= 5 and any("招生学校代码" in clean_text(cell) for cell in raw):
247 271
                         continue
248 272
                     if len(raw) >= 5:
249 273
                         high_code = clean_code(raw[0])
250 274
                         if high_code in high_by_code:
275
+                            current_high_code = high_code
251 276
                             current_high = high_by_code[high_code]
252 277
                         junior_code = clean_code(raw[2])
253 278
                         junior_name = clean_text(raw[3])

+ 34 - 1
秒过分数线数据导入/需求.md

@@ -75,7 +75,9 @@ SELECT * FROM kylx365_db.MPS_Score where ScoreYear='2025' and ScoreType='名额
75 75
         case 16:
76 76
             return "崇明区";
77 77
 
78
-如果遇到问题,整个区就先不动。等全部处理完,我们来处理问题区的数据。——————————————————————
78
+如果遇到问题,整个区就先不动。等全部处理完,我们来处理问题区的数据。
79
+
80
+——————————————————————
79 81
 
80 82
 接下来,将pdf或图片中的“名额到校”数据输入到数据库表中。
81 83
 
@@ -96,3 +98,34 @@ SELECT * FROM kylx365_db.MPS_Score where ScoreYear='2025' and ScoreType='名额
96 98
 
97 99
 一共16个区。
98 100
 
101
+——————————————————————————现在需要你帮我处理秒过分数线的1-15志愿数据
102
+
103
+之前所有的需求与处理过程看下面
104
+/Users/chengjie/Documents/git/miaoguo_system_server/秒过分数线数据导入/需求.md
105
+/Users/chengjie/Documents/git/miaoguo_system_server/秒过分数线数据导入/README.md
106
+
107
+之前完成的工作有
108
+- 自主招生
109
+- 名额到区
110
+- 名额到校
111
+
112
+现在需要做 1-15志愿
113
+
114
+下面是2025年的老数据
115
+/Volumes/程杰外接SD盘/上海中考招生计划/2025/计划/1-15志愿
116
+(你可以先看黄浦区)
117
+
118
+下面是2025年导入的数据表
119
+select * FROM kylx365_db.MPS_Score where ScoreYear='2025' and ScoreType='1-15志愿' and DistrictID=1; 
120
+
121
+下面是所有高中的数据,包括全称、简称和别名,你可以对照
122
+SELECT * FROM kylx365_db.MPS_School where SchoolType1='高中';
123
+
124
+当下,就是需要你处理
125
+/Volumes/程杰外接SD盘/上海中考招生计划/2026/计划/1-15志愿
126
+下的所有数据。
127
+
128
+你先研究起来,如果明白了,可以做黄浦区(其他区我会陆续放到上面目录)
129
+
130
+/Volumes/程杰外接SD盘/上海中考招生计划/2026/计划/1-15志愿/黄浦区1.png
131
+/Volumes/程杰外接SD盘/上海中考招生计划/2026/计划/1-15志愿/黄浦区2.png