|
|
@@ -213,41 +213,66 @@ def parse_qingpu_text(path, high_by_code, junior_by_code):
|
|
213
|
213
|
rows = []
|
|
214
|
214
|
problems = []
|
|
215
|
215
|
current_high = None
|
|
|
216
|
+ current_high_code = None
|
|
216
|
217
|
|
|
217
|
218
|
with pdfplumber.open(path) as pdf:
|
|
218
|
219
|
for page in pdf.pages:
|
|
219
|
|
- high_markers = []
|
|
|
220
|
+ page_high_codes = []
|
|
220
|
221
|
for word in page.extract_words(x_tolerance=3, y_tolerance=3):
|
|
221
|
222
|
if word.get("x0", 999) > 230:
|
|
222
|
223
|
continue
|
|
223
|
224
|
code = clean_code(word.get("text"))
|
|
224
|
225
|
if code in high_by_code:
|
|
225
|
|
- high_markers.append((float(word["top"]), code, high_by_code[code]))
|
|
226
|
|
- high_markers.sort(key=lambda item: item[0])
|
|
|
226
|
+ if code not in page_high_codes:
|
|
|
227
|
+ page_high_codes.append(code)
|
|
227
|
228
|
|
|
228
|
229
|
for table in page.find_tables():
|
|
229
|
230
|
extracted = table.extract()
|
|
230
|
|
- table_top = float(table.bbox[1])
|
|
231
|
|
- preceding = [item for item in high_markers if item[0] < table_top]
|
|
232
|
|
- if preceding:
|
|
233
|
|
- current_high = preceding[-1][2]
|
|
|
231
|
+ table_top, table_bottom = float(table.bbox[1]), float(table.bbox[3])
|
|
|
232
|
+ is_continuation_table = extracted and len(extracted[0]) == 3
|
|
|
233
|
+ boundary_switches = {}
|
|
|
234
|
+ full_width_boundaries = []
|
|
|
235
|
+ for edge in page.edges:
|
|
|
236
|
+ if edge.get("orientation") != "h":
|
|
|
237
|
+ continue
|
|
|
238
|
+ top = float(edge["top"])
|
|
|
239
|
+ if (
|
|
|
240
|
+ float(edge["x0"]) < 100
|
|
|
241
|
+ and float(edge["x1"]) > 500
|
|
|
242
|
+ and table_top + 3 < top < table_bottom - 3
|
|
|
243
|
+ ):
|
|
|
244
|
+ if not full_width_boundaries or abs(top - full_width_boundaries[-1]) > 2:
|
|
|
245
|
+ full_width_boundaries.append(top)
|
|
|
246
|
+
|
|
|
247
|
+ if is_continuation_table:
|
|
|
248
|
+ if page_high_codes:
|
|
|
249
|
+ current_high_code = page_high_codes[0]
|
|
|
250
|
+ current_high = high_by_code[current_high_code]
|
|
|
251
|
+ next_high_codes = page_high_codes[1:]
|
|
|
252
|
+ switch_pairs = zip(full_width_boundaries, next_high_codes)
|
|
|
253
|
+ else:
|
|
|
254
|
+ switch_pairs = zip(full_width_boundaries, page_high_codes)
|
|
|
255
|
+
|
|
|
256
|
+ for boundary, code in switch_pairs:
|
|
|
257
|
+ switch_index = min(
|
|
|
258
|
+ range(len(table.rows)),
|
|
|
259
|
+ key=lambda index: abs(float(table.rows[index].bbox[1]) - boundary),
|
|
|
260
|
+ )
|
|
|
261
|
+ boundary_switches[switch_index] = code
|
|
234
|
262
|
|
|
235
|
263
|
for index, raw in enumerate(extracted):
|
|
236
|
264
|
if not raw:
|
|
237
|
265
|
continue
|
|
238
|
|
- row_bbox = table.rows[index].bbox
|
|
239
|
|
- row_top, row_bottom = float(row_bbox[1]), float(row_bbox[3])
|
|
240
|
|
- markers_in_row = [
|
|
241
|
|
- item for item in high_markers if row_top - 3 <= item[0] <= row_bottom + 3
|
|
242
|
|
- ]
|
|
243
|
|
- if markers_in_row:
|
|
244
|
|
- current_high = markers_in_row[-1][2]
|
|
|
266
|
+ if index in boundary_switches:
|
|
|
267
|
+ current_high_code = boundary_switches[index]
|
|
|
268
|
+ current_high = high_by_code[current_high_code]
|
|
245
|
269
|
|
|
246
|
270
|
if len(raw) >= 5 and any("招生学校代码" in clean_text(cell) for cell in raw):
|
|
247
|
271
|
continue
|
|
248
|
272
|
if len(raw) >= 5:
|
|
249
|
273
|
high_code = clean_code(raw[0])
|
|
250
|
274
|
if high_code in high_by_code:
|
|
|
275
|
+ current_high_code = high_code
|
|
251
|
276
|
current_high = high_by_code[high_code]
|
|
252
|
277
|
junior_code = clean_code(raw[2])
|
|
253
|
278
|
junior_name = clean_text(raw[3])
|