пре 1 месец · 7013174e7a
--- a/秒过分数线数据导入/README.md
+++ b/秒过分数线数据导入/README.md
--- a/秒过分数线数据导入/mps_score_school_quota_2026_qingpu_cross_page_fix_backup.json
+++ b/秒过分数线数据导入/mps_score_school_quota_2026_qingpu_cross_page_fix_backup.json
--- a/秒过分数线数据导入/research_mps_score_school_quota_2026.py
+++ b/秒过分数线数据导入/research_mps_score_school_quota_2026.py
@@ -213,41 +213,66 @@ def parse_qingpu_text(path, high_by_code, junior_by_code):
 
				     rows = []
			
 
				     problems = []
			
 
				     current_high = None
			
 
				+    current_high_code = None
			
 
				 
			
 
				     with pdfplumber.open(path) as pdf:
			
 
				         for page in pdf.pages:
			
 
				-            high_markers = []
			
 
				+            page_high_codes = []
			
 
				             for word in page.extract_words(x_tolerance=3, y_tolerance=3):
			
 
				                 if word.get("x0", 999) > 230:
			
 
				                     continue
			
 
				                 code = clean_code(word.get("text"))
			
 
				                 if code in high_by_code:
			
 
				-                    high_markers.append((float(word["top"]), code, high_by_code[code]))
			
 
				-            high_markers.sort(key=lambda item: item[0])
			
 
				+                    if code not in page_high_codes:
			
 
				+                        page_high_codes.append(code)
			
 
				 
			
 
				             for table in page.find_tables():
			
 
				                 extracted = table.extract()
			
 
				-                table_top = float(table.bbox[1])
			
 
				-                preceding = [item for item in high_markers if item[0] < table_top]
			
 
				-                if preceding:
			
 
				-                    current_high = preceding[-1][2]
			
 
				+                table_top, table_bottom = float(table.bbox[1]), float(table.bbox[3])
			
 
				+                is_continuation_table = extracted and len(extracted[0]) == 3
			
 
				+                boundary_switches = {}
			
 
				+                full_width_boundaries = []
			
 
				+                for edge in page.edges:
			
 
				+                    if edge.get("orientation") != "h":
			
 
				+                        continue
			
 
				+                    top = float(edge["top"])
			
 
				+                    if (
			
 
				+                        float(edge["x0"]) < 100
			
 
				+                        and float(edge["x1"]) > 500
			
 
				+                        and table_top + 3 < top < table_bottom - 3
			
 
				+                    ):
			
 
				+                        if not full_width_boundaries or abs(top - full_width_boundaries[-1]) > 2:
			
 
				+                            full_width_boundaries.append(top)
			
 
				+
			
 
				+                if is_continuation_table:
			
 
				+                    if page_high_codes:
			
 
				+                        current_high_code = page_high_codes[0]
			
 
				+                        current_high = high_by_code[current_high_code]
			
 
				+                    next_high_codes = page_high_codes[1:]
			
 
				+                    switch_pairs = zip(full_width_boundaries, next_high_codes)
			
 
				+                else:
			
 
				+                    switch_pairs = zip(full_width_boundaries, page_high_codes)
			
 
				+
			
 
				+                for boundary, code in switch_pairs:
			
 
				+                    switch_index = min(
			
 
				+                        range(len(table.rows)),
			
 
				+                        key=lambda index: abs(float(table.rows[index].bbox[1]) - boundary),
			
 
				+                    )
			
 
				+                    boundary_switches[switch_index] = code
			
 
				 
			
 
				                 for index, raw in enumerate(extracted):
			
 
				                     if not raw:
			
 
				                         continue
			
 
				-                    row_bbox = table.rows[index].bbox
			
 
				-                    row_top, row_bottom = float(row_bbox[1]), float(row_bbox[3])
			
 
				-                    markers_in_row = [
			
 
				-                        item for item in high_markers if row_top - 3 <= item[0] <= row_bottom + 3
			
 
				-                    ]
			
 
				-                    if markers_in_row:
			
 
				-                        current_high = markers_in_row[-1][2]
			
 
				+                    if index in boundary_switches:
			
 
				+                        current_high_code = boundary_switches[index]
			
 
				+                        current_high = high_by_code[current_high_code]
			
 
				 
			
 
				                     if len(raw) >= 5 and any("招生学校代码" in clean_text(cell) for cell in raw):
			
 
				                         continue
			
 
				                     if len(raw) >= 5:
			
 
				                         high_code = clean_code(raw[0])
			
 
				                         if high_code in high_by_code:
			
 
				+                            current_high_code = high_code
			
 
				                             current_high = high_by_code[high_code]
			
 
				                         junior_code = clean_code(raw[2])
			
 
				                         junior_name = clean_text(raw[3])
			
--- a/秒过分数线数据导入/需求.md
+++ b/秒过分数线数据导入/需求.md
@@ -75,7 +75,9 @@ SELECT * FROM kylx365_db.MPS_Score where ScoreYear='2025' and ScoreType='名额
 
				         case 16:
			
 
				             return "崇明区";
			
 
				 
			
 
				-如果遇到问题，整个区就先不动。等全部处理完，我们来处理问题区的数据。——————————————————————
			
 
				+如果遇到问题，整个区就先不动。等全部处理完，我们来处理问题区的数据。
			
 
				+
			
 
				+——————————————————————
			
 
				 
			
 
				 接下来，将pdf或图片中的“名额到校”数据输入到数据库表中。
			
 
				 
			
@@ -96,3 +98,34 @@ SELECT * FROM kylx365_db.MPS_Score where ScoreYear='2025' and ScoreType='名额
 
				 
			
 
				 一共16个区。
			
 
				 
			
 
				+—————————————————————————— 现在需要你帮我处理秒过分数线的“1-15志愿”数据。
			
 
				+
			
 
				+之前所有的需求与处理过程，请参见下面两个文件：
			
 
				+/Users/chengjie/Documents/git/miaoguo_system_server/秒过分数线数据导入/需求.md
			
 
				+/Users/chengjie/Documents/git/miaoguo_system_server/秒过分数线数据导入/README.md
			
 
				+
			
 
				+之前完成的工作有
			
 
				+- 自主招生
			
 
				+- 名额到区
			
 
				+- 名额到校
			
 
				+
			
 
				+现在需要做 1-15志愿
			
 
				+
			
 
				+下面是2025年的老数据
			
 
				+/Volumes/程杰外接SD盘/上海中考招生计划/2025/计划/1-15志愿
			
 
				+（你可以先看黄浦区）
			
 
				+
			
 
				+下面是2025年导入的数据表
			
 
				+select * FROM kylx365_db.MPS_Score where ScoreYear='2025' and ScoreType='1-15志愿' and DistrictID=1; 
			
 
				+
			
 
				+下面是所有高中的数据，包括全称、简称和别名，你可以对照
			
 
				+SELECT * FROM kylx365_db.MPS_School where SchoolType1='高中';
			
 
				+
			
 
				+当下，就是需要你处理
			
 
				+/Volumes/程杰外接SD盘/上海中考招生计划/2026/计划/1-15志愿
			
 
				+下的所有数据。
			
 
				+
			
 
				+你先研究起来，如果明白了，可以做黄浦区（其他区我会陆续放到上面目录）
			
 
				+
			
 
				+/Volumes/程杰外接SD盘/上海中考招生计划/2026/计划/1-15志愿/黄浦区1.png
			
 
				+/Volumes/程杰外接SD盘/上海中考招生计划/2026/计划/1-15志愿/黄浦区2.png