Просмотр исходного кода

增加秒过分数线数据导入程序

chengjie 1 неделя назад
Родитель
Сommit
8048dc25d6

+ 1 - 1
src/api/miaoguo/literacyController.js

@@ -445,7 +445,7 @@ async function getAiData(word){
445 445
 
446 446
     let aiProvider="";
447 447
     //aiProvider="doubao-seed-1-8-251228";
448
-    aiProvider="doubao-1-5-lite-32k-250115";
448
+    aiProvider="doubao-deepseek-v4-flash-260425";
449 449
     
450 450
     //生成结果
451 451
     result = await aiController.generateArticle(content, aiProvider);

+ 2 - 0
src/api/yjbdc/aiController.js

@@ -497,6 +497,8 @@ class AIProviderFactory {
497 497
                 return new VolcesAIProvider("deepseek-v3-1-250821");
498 498
             case 'doubao-deepseek-v3-2-251201':
499 499
                 return new VolcesAIProvider("deepseek-v3-2-251201");
500
+            case 'doubao-deepseek-v4-flash-260425':
501
+                return new VolcesAIProvider("deepseek-v4-flash-260425");
500 502
 
501 503
             case 'doubao-deepseek-r1-250528':
502 504
                 return new VolcesAIProvider("deepseek-r1-250528");

+ 1 - 1
src/api/yjbdc/yjbdcController.js

@@ -105,7 +105,7 @@ export async function GenerateArticle(ctx) {
105 105
                 //'doubao-kimi-k2-250711': 100,
106 106
                 //'doubao-deepseek-v3-2-251201': 100,
107 107
                 //'doubao-seed-1-8-251228': 100,
108
-                'doubao-1-5-lite-32k-250115':100,
108
+                'doubao-deepseek-v4-flash-260425':100,
109 109
                 //'ali-Moonshot-kimi-k2.5': 100,
110 110
             });
111 111
            

+ 2 - 1
src/util/constant/index.js

@@ -27,12 +27,13 @@ export default {
27 27
             {Name:"人生励志",ID:12,CSS:"",Eng:"Inspirational",English:"Inspirational",Content:"做一个有价值的人"}
28 28
         ];        
29 29
         result.AIVersion=[
30
-            {Version:"db20",BuildSecond:45,Model:"doubao-1-5-lite-32k-250115",Content:"字节豆包2.0\n平均45秒生成",CSS:""},
30
+            {Version:"zdp4f",BuildSecond:30,Model:"doubao-deepseek-v4-flash-260425",Content:"字节deepseek_v4flash\n平均30秒生成",CSS:""},
31 31
             {Version:"1.0",BuildSecond:45,Model:"",Content:"词句丰富,结构简明\n平均30秒生成",CSS:"Selected"},
32 32
             {Version:"1.5",BuildSecond:60,Model:"",Content:"深度表达,更多要素\n平均60秒生成",CSS:""},
33 33
             {Version:"db15",BuildSecond:30,Model:"doubao-1-5-pro-32k-250115",Content:"字节豆包1.5\n平均30秒生成",CSS:""},
34 34
             {Version:"db16",BuildSecond:60,Model:"doubao-seed-1-6-250615",Content:"字节豆包1.6\n平均60秒生成",CSS:""},
35 35
             {Version:"db18",BuildSecond:30,Model:"doubao-seed-1-8-251228",Content:"字节豆包1.8\n平均30秒生成",CSS:""},
36
+            {Version:"db20",BuildSecond:45,Model:"doubao-1-5-lite-32k-250115",Content:"字节豆包2.0\n平均45秒生成",CSS:""},
36 37
             
37 38
             {Version:"zdp3",BuildSecond:60,Model:"doubao-deepseek-v3-250324",Content:"字节deepseek_v3\n平均60秒生成",CSS:""},
38 39
             {Version:"zdp31",BuildSecond:30,Model:"doubao-deepseek-v3-1-250821",Content:"字节deepseek_v31\n平均30秒生成",CSS:""},

+ 41 - 5
秒过分数线数据导入/README.md

@@ -165,6 +165,21 @@ WHERE ScoreYear = '2025'
165 165
 - 对“原名(现某某)”这种文本,匹配时应同时尝试原名、括号内现名、去括号名称。
166 166
 - 图片清晰时可以 OCR/读图解决,但要把结果转成结构化行,再按学校表 ID 入库。
167 167
 
168
+2026 懿德中学问题复盘:
169
+
170
+- 触发点:复查浦东新区 `上海市浦东新区懿德中学` 时发现名额到校目标高中不对。
171
+- 直接错误:PDF 原始行中最后两列应为 `上海市浦东复旦附中分校` 和 `上海中学东校`,旧脚本分别写成了 `复旦大学附属中学` 和 `上海市上海中学`。
172
+- 根因:高中表头没有 6 位代码时,旧逻辑先做简称别名匹配,`上海市浦东复旦附中分校` 先命中 `复旦附中`,`上海中学东校` 先命中 `上海中学`,导致分校/东校被主校抢走。
173
+- 同类影响:普陀 `华二普陀`、宝山 `华二宝山` / `上师附中宝山`、浦东 `上海中学东校` / `浦东复旦附中分校` / `华二临港奉贤分校`、松江 `松江二中` / `华二松江分校`、奉贤 `华二临港奉贤分校`。
174
+- 修正原则:学校匹配顺序必须是“6 位代码优先,其次精确全称/简称/别名字段,最后才用简称兜底”;简称兜底还要按别名长度从长到短匹配,避免 `华二` 抢在 `华二普陀` 前面。
175
+- 额外问题:青浦区名额到校 PDF 是长表跨页,高中段落在表格抽取中会丢失,不能只依赖 `pdfplumber.extract_tables()` 的表格状态续接。
176
+- 第一次青浦修正仍有隐患:用 `extract_text()` 的自然文本顺序识别高中段落,会把视觉上同行的内容拆错。例如 PDF 表格中 `102056 上海交通大学附属中学 / 181021 上海市青浦区思源中学 / 1` 是同一行,但文本抽取顺序会先输出 `181021 上海市青浦区思源中学 1`,再输出 `102056 上海交通大学附属中学`,导致思源中学被错误归到上一段 `上海市上海中学`。
177
+- 最终修正方式:青浦区改用 `pdfplumber.find_tables()` 的行坐标作为主依据,再用左侧高中代码文字的坐标判断“从哪一行开始切换高中”。这样可以处理页 1 同行高中代码、页 2 跨页延续、页 3/页 4 中途切换高中段落等情况。
178
+- 青浦修正结果示例:`上海市上海中学` 只对应 `上海市青浦区凤溪中学`;`上海交通大学附属中学` 对应 `上海市青浦区思源中学` 和 `上海市青浦区实验中学`。
179
+- 同类风险扫描:已扫描 16 个区 PDF,仅青浦存在“左侧高中段落 + 右侧三列表格 + 文本顺序错位”的版式;其他区未发现同类结构。
180
+- 修正方式:备份受影响区旧数据到 `mps_score_school_quota_2026_bad_targets_backup.json`,再重建普陀、宝山、浦东、松江、青浦、奉贤 6 个区数据;后续又单独备份青浦旧数据到 `mps_score_school_quota_2026_qingpu_reparse_backup.json`,并重建青浦区数据。
181
+- 后续要求:遇到“分校、校区、东校、宝山、普陀、松江、临港奉贤”等表头,必须人工抽样检查目标高中 ID;遇到长表跨页或左右分栏版式,不能相信纯文本抽取顺序,必须结合表格行坐标或人工抽样;导入后必须做重复业务 key 检查,即 `ScoreYear + ScoreType + DistrictID + SchoolOfGraduation + SchoolTarget` 不应重复。
182
+
168 183
 ## 当前脚本说明
169 184
 
170 185
 脚本分为三类:主流程脚本、公共解析/补录脚本、2026 一次性补充脚本。后续年度工作时,主流程和公共脚本可以复制改年份;一次性补充脚本主要用于追溯 2026 的特殊处理,不建议直接运行到新年份。
@@ -184,6 +199,13 @@ WHERE ScoreYear = '2025'
184 199
 - 如果某区已存在数据,会跳过并报告。
185 200
 - 对图片或解析失败区,使用 `import_mps_score_quota_manual_2026.py` 做手工/OCR 补充。
186 201
 
202
+名额到区官方总表审核:
203
+
204
+- `audit_mps_score_quota_2026.py`:读取官方《2026全市高中名额到区招生计划统计表》PDF,并与数据库 `2026 名额到区` 按高中汇总结果比对。
205
+- 审核口径:以 PDF 中高中 6 位招生代码为准,对应 `MPS_School.SchoolNumber`,再比较官方计划数与数据库 `SUM(PlanNum)`。
206
+- 当前审核结果:官方 77 所、计划数 7171;数据库 77 所、计划数 7171;逐校差异 0。
207
+- 经验:只要官方发布全市/全区统计表,就应作为最终验算口径。它不能证明每一条初中分配都正确,但能快速发现高中目标映射错误、漏导、重复导入、计划数错位等系统性问题。
208
+
187 209
 名额到校:
188 210
 
189 211
 - `research_mps_score_school_quota_2026.py`
@@ -216,6 +238,15 @@ WHERE ScoreYear = '2025'
216 238
 
217 239
 - `__pycache__/` 和 `*.pyc` 是 Python 运行缓存,不属于业务数据或脚本,已在主仓库 `.gitignore` 中忽略。
218 240
 
241
+名额到校区内合计审核:
242
+
243
+- `audit_mps_score_school_quota_totals_2026.py`:读取各区名额到校 PDF 中明确存在的“合计”行,与数据库 `2026 名额到校` 按区/高中汇总结果比对。
244
+- 当前可自动审核区:长宁区、宝山区、金山区、松江区、崇明区。
245
+- 审核结果:上述 5 个区 PDF 合计与数据库逐项一致,差异 0。
246
+- 注意:有些 PDF 的合计行包含水印或序号类数字,例如金山、松江合计行中不参与总计的数字,脚本中已显式忽略;没有明确合计行的区不纳入此脚本自动审核。
247
+- 经验:名额到校审核要优先找 PDF 自带的“合计/总计”行。能自动审核的区,应至少核对区总计和高中列合计;没有合计行的区,也要尽量通过官方后续统计表、人工抽样、重复 key 检查来补充验证。
248
+- 边界:合计审核只能证明“高中列汇总”和“区总量”正确,不能完全证明每个初中分配行都正确;因此它应与学校匹配日志、问题清单、重复 key 检查一起使用。
249
+
219 250
 ## 2026 已完成结果
220 251
 
221 252
 计划/自主招生:
@@ -236,10 +267,15 @@ WHERE ScoreYear = '2025'
236 267
 
237 268
 - `ScoreYear = 2026`
238 269
 - `ScoreType = 名额到校`
239
-- 已导入 3892
240
-- 计划数合计 12833
270
+- 已导入 3893
271
+- 计划数合计 12887
241 272
 - 问题清单 `mps_score_school_quota_2026_problems.json` 已清空
242 273
 
274
+
275
+修正记录:
276
+
277
+- 2026-06-01 修正名额到校部分高中目标误匹配问题。原因是分校/校区/东校表头先命中了主校简称别名;同时修正青浦长表跨页高中段落解析。修正前旧数据已备份到 `mps_score_school_quota_2026_bad_targets_backup.json`。
278
+
243 279
 2026 名额到校最终分区汇总:
244 280
 
245 281
 | DistrictID | 区 | 行数 | 计划数 |
@@ -252,12 +288,12 @@ WHERE ScoreYear = '2025'
252 288
 | 6 | 虹口区 | 80 | 488 |
253 289
 | 7 | 杨浦区 | 144 | 707 |
254 290
 | 8 | 闵行区 | 460 | 1290 |
255
-| 9 | 宝山区 | 348 | 1076 |
291
+| 9 | 宝山区 | 343 | 1076 |
256 292
 | 10 | 嘉定区 | 130 | 612 |
257
-| 11 | 浦东新区 | 1259 | 2082 |
293
+| 11 | 浦东新区 | 1260 | 2095 |
258 294
 | 12 | 金山区 | 56 | 355 |
259 295
 | 13 | 松江区 | 190 | 779 |
260
-| 14 | 青浦区 | 93 | 725 |
296
+| 14 | 青浦区 | 98 | 766 |
261 297
 | 15 | 奉贤区 | 131 | 345 |
262 298
 | 16 | 崇明区 | 50 | 223 |
263 299
 

+ 160 - 0
秒过分数线数据导入/audit_mps_score_quota_2026.py

@@ -0,0 +1,160 @@
1
+import json
2
+import re
3
+import sys
4
+
5
+import pdfplumber
6
+
7
+sys.path.insert(0, "/private/tmp/codex_mysql_driver")
8
+import pymysql  # noqa: E402
9
+
10
+
11
import os  # noqa: E402  (kept next to its only user so this edit stands alone)

# Connection settings handed to pymysql.connect().
#
# NOTE(security): the host, user and password below were committed in plain text.
# Each value can be overridden through the MPS_DB_* environment variables; the
# literals are kept only as backward-compatible fallbacks so the script keeps
# running unchanged. They should be removed from the repository and the password
# rotated once the environment variables are in place.
DB_CONFIG = {
    "host": os.environ.get("MPS_DB_HOST", "589ae8e08493d.sh.cdb.myqcloud.com"),
    "port": int(os.environ.get("MPS_DB_PORT", "8124")),
    "user": os.environ.get("MPS_DB_USER", "cdb_outerroot"),
    "password": os.environ.get("MPS_DB_PASSWORD", "kylx!@#!QAZ@WSX"),
    "database": os.environ.get("MPS_DB_NAME", "kylx365_db"),
    "charset": "utf8mb4",
    "connect_timeout": 10,
    "read_timeout": 20,
}
21
+
22
+YEAR = "2026"
23
+SCORE_TYPE = "名额到区"
24
+OFFICIAL_PDF = (
25
+    "/Volumes/程杰外接SD盘/上海中考招生计划/2026/计划/"
26
+    "2026全市高中名额到区扫生计划统计表.pdf"
27
+)
28
+
29
+
30
def clean_text(value):
    """Normalise a PDF cell into compact text.

    Falsy input becomes an empty string and all whitespace (including newlines)
    is removed. A leading run of the characters 院/试/考/育/教/市/海/上 is then
    dropped when it sits directly in front of a 6-digit code or one of
    上海 / 复旦 / 华东 / 同济.
    """
    raw = "" if not value else str(value)
    # `\s+` already covers newlines, so one substitution is enough.
    compact = re.sub(r"\s+", "", raw)
    return re.sub(r"^[院试考育教市海上]+(?=\d{6}|上海|复旦|华东|同济)", "", compact)
34
+
35
+
36
def clean_code(value):
    """Return the first run of six consecutive digits in ``value``, or ``None``.

    Falsy input is treated as an empty string.
    """
    hits = re.findall(r"\d{6}", str(value or ""))
    if not hits:
        return None
    return hits[0]
39
+
40
+
41
def clean_num(value):
    """Return the last run of digits in ``value`` as an int, or ``None``.

    Falsy input (including the integer 0) is treated as an empty string and
    therefore yields ``None``. A leading minus sign is not part of the match.
    """
    text = "" if not value else str(value)
    last_run = None
    for match in re.finditer(r"\d+", text):
        last_run = match.group(0)
    return None if last_run is None else int(last_run)
44
+
45
+
46
def parse_official_pdf(path):
    """Sum the official plan numbers per 6-digit school code from the PDF at ``path``.

    Every table on every page is read; the first row of each table is skipped
    and rows with fewer than 9 cells are ignored. The code is taken from the
    second cell, the name from the third and the plan number from the last.

    Returns a dict keyed by code with ``SchoolNumber``, ``pdf_names`` (one entry
    per contributing row) and ``official_plan`` (running total).
    """
    totals = {}
    with pdfplumber.open(path) as pdf:
        # Lazily flatten pages -> tables -> data rows; consumed inside the `with`.
        data_rows = (
            cells
            for page in pdf.pages
            for table in page.extract_tables()
            for cells in table[1:]
        )
        for cells in data_rows:
            if len(cells) < 9:
                continue
            code = clean_code(cells[1])
            plan = clean_num(cells[-1])
            if plan is None or not code:
                continue
            if code not in totals:
                totals[code] = {"SchoolNumber": code, "pdf_names": [], "official_plan": 0}
            entry = totals[code]
            entry["pdf_names"].append(clean_text(cells[2]))
            entry["official_plan"] += plan
    return totals
64
+
65
+
66
def load_db_summary(cursor):
    """Load high schools and the per-school plan totals for YEAR / SCORE_TYPE.

    Args:
        cursor: a DB cursor whose rows are dict-like (``row["ID"]`` etc.).

    Returns:
        A tuple ``(school_by_code, db_by_code, unmatched)``:
        - ``school_by_code``: high-school rows keyed by ``str(SchoolNumber)``;
          rows with a falsy SchoolNumber are left out.
        - ``db_by_code``: per school code, a dict with ``SchoolNumber``,
          ``SchoolTarget`` (school ID), ``db_name`` (name of the first score
          row seen) and ``db_plan`` (summed plan).
        - ``unmatched``: score rows that could not be tied to a high school
          with a school number.
    """
    cursor.execute(
        """
        SELECT ID, SchoolNumber, SchoolFullName
        FROM MPS_School
        WHERE SchoolType1 = '高中'
        """
    )
    schools = cursor.fetchall()
    school_by_id = {int(row["ID"]): row for row in schools}
    school_by_code = {str(row["SchoolNumber"]): row for row in schools if row["SchoolNumber"]}

    cursor.execute(
        """
        SELECT SchoolTarget, SchoolFullName, SUM(PlanNum) AS total
        FROM MPS_Score
        WHERE ScoreYear = %s AND ScoreType = %s
        GROUP BY SchoolTarget, SchoolFullName
        ORDER BY CAST(SchoolTarget AS UNSIGNED), SchoolTarget
        """,
        (YEAR, SCORE_TYPE),
    )

    db_by_code = {}
    unmatched = []
    for row in cursor.fetchall():
        # A NULL or non-numeric SchoolTarget must be reported as unmatched
        # instead of aborting the whole audit with TypeError/ValueError.
        try:
            target_id = int(row["SchoolTarget"])
        except (TypeError, ValueError):
            target_id = None
        school = school_by_id.get(target_id) if target_id is not None else None
        if not school or not school.get("SchoolNumber"):
            unmatched.append(row)
            continue
        code = str(school["SchoolNumber"])
        item = db_by_code.setdefault(
            code,
            {
                "SchoolNumber": code,
                "SchoolTarget": school["ID"],
                "db_name": row["SchoolFullName"],
                "db_plan": 0,
            },
        )
        # Several score rows may map to one code (grouping includes the name).
        item["db_plan"] += int(row["total"] or 0)

    return school_by_code, db_by_code, unmatched
109
+
110
+
111
def compare(official, school_by_code, db_by_code):
    """List every school code whose official and database figures disagree.

    A code is reported when the plan numbers differ or when it is missing on
    either side. ``status`` is ``db_only``, ``official_only`` or
    ``plan_mismatch``. Codes are visited in sorted order.
    """
    diffs = []
    for code in sorted({*official, *db_by_code}):
        pdf_side = official.get(code)
        db_side = db_by_code.get(code)
        pdf_fields = pdf_side or {}
        db_fields = db_side or {}

        # Prefer the school master row; fall back to what the score rows carried.
        school = school_by_code.get(code)
        if school:
            target_id = school["ID"]
            display_name = school["SchoolFullName"]
        else:
            target_id = db_fields.get("SchoolTarget")
            display_name = db_fields.get("db_name")

        official_plan = pdf_fields.get("official_plan", 0)
        db_plan = db_fields.get("db_plan", 0)
        delta = db_plan - official_plan
        if not delta and pdf_side and db_side:
            continue

        if not pdf_side:
            status = "db_only"
        elif not db_side:
            status = "official_only"
        else:
            status = "plan_mismatch"

        # dict.fromkeys de-duplicates the PDF names while keeping first-seen order.
        unique_names = dict.fromkeys(pdf_fields.get("pdf_names", []))
        diffs.append(
            {
                "SchoolNumber": code,
                "SchoolTarget": target_id,
                "school_name": display_name,
                "pdf_name": " / ".join(unique_names),
                "official_plan": official_plan,
                "db_plan": db_plan,
                "delta_db_minus_official": delta,
                "status": status,
            }
        )
    return diffs
135
+
136
+
137
def main():
    """Parse the official PDF, load the DB summary and print the comparison as JSON."""
    official = parse_official_pdf(OFFICIAL_PDF)

    connection = pymysql.connect(**DB_CONFIG)
    try:
        with connection.cursor(pymysql.cursors.DictCursor) as cursor:
            school_by_code, db_by_code, unmatched = load_db_summary(cursor)
    finally:
        # The connection is only needed for the summary queries.
        connection.close()

    diffs = compare(official, school_by_code, db_by_code)
    official_total = sum(entry["official_plan"] for entry in official.values())
    db_total = sum(entry["db_plan"] for entry in db_by_code.values())
    report = {
        "official_schools": len(official),
        "official_total": official_total,
        "db_schools": len(db_by_code),
        "db_total": db_total,
        "diff_count": len(diffs),
        "unmatched_db_rows": unmatched,
        "diffs": diffs,
    }
    # default=str keeps non-JSON values coming from the DB rows printable.
    print(json.dumps(report, ensure_ascii=False, default=str, indent=2))
157
+
158
+
159
+if __name__ == "__main__":
160
+    main()

+ 212 - 0
秒过分数线数据导入/audit_mps_score_school_quota_totals_2026.py

@@ -0,0 +1,212 @@
1
+import json
2
+import os
3
+import re
4
+import sys
5
+
6
+import pdfplumber
7
+
8
+sys.path.insert(0, "/private/tmp/codex_mysql_driver")
9
+import pymysql  # noqa: E402
10
+
11
+import research_mps_score_school_quota_2026 as parser  # noqa: E402
12
+
13
+
14
+YEAR = "2026"
15
+SCORE_TYPE = "名额到校"
16
+
17
+# Districts whose PDFs contain explicit footer totals.
18
+# Footer columns are configured because some PDFs do not repeat headers on the final page.
19
def _school_totals(*codes):
    """Build one ``school_total`` footer-column spec per school code, in order."""
    return [{"kind": "school_total", "code": code} for code in codes]


# Footer column that holds a number which is not part of any total.
_NOISE_COLUMN = {"kind": "ignore", "label": "ignored_pdf_noise"}
# Footer column that holds the district-wide total.
_DISTRICT_TOTAL_COLUMN = {"kind": "district_total", "label": "合计"}

# DistrictID -> ordered description of the numbers found in that PDF's footer row.
AUDIT_CONFIG = {
    3: _school_totals(
        "052002", "053004", "052001", "042032", "102057", "102056", "152003", "152006"
    ),
    9: _school_totals(
        "132001", "133001", "132002", "133003", "132003",
        "042032", "152003", "102057", "102056",
    ),
    12: [
        dict(_NOISE_COLUMN),
        *_school_totals("162000", "163002"),
        dict(_DISTRICT_TOTAL_COLUMN),
    ],
    13: [
        dict(_NOISE_COLUMN),
        *_school_totals("172001", "173001", "174003", "172002", "172004"),
        dict(_DISTRICT_TOTAL_COLUMN),
    ],
    16: _school_totals("512000", "512001", "042032", "152003", "102057", "102056"),
}
65
+
66
+
67
def clean_num(value):
    """Return the last (optionally negative) integer in the cleaned cell text, or ``None``.

    The cell is first normalised with ``parser.clean_text``.
    """
    signed_runs = re.findall(r"-?\d+", parser.clean_text(value))
    if not signed_runs:
        return None
    return int(signed_runs[-1])
70
+
71
+
72
def find_footer_numbers(path):
    """Return the numbers of the last "合计" row in the PDF at ``path``.

    A table row qualifies when any cell contains "合计" after cleaning and at
    least two of its cells yield a number. Returns an empty list when no row
    qualifies.
    """
    last_footer = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                if not table:
                    continue
                for row in table:
                    is_total_row = any("合计" in parser.clean_text(cell) for cell in row)
                    if not is_total_row:
                        continue
                    numbers = []
                    for cell in row:
                        number = clean_num(cell)
                        if number is not None:
                            numbers.append(number)
                    # Only the most recent qualifying row is ever returned.
                    if len(numbers) >= 2:
                        last_footer = numbers
    return last_footer
88
+
89
+
90
def load_db(cursor):
    """Load high schools and the per-district / per-school plan totals.

    Args:
        cursor: a DB cursor whose rows are dict-like.

    Returns:
        A tuple ``(school_by_code, by_district, by_school)``:
        - ``school_by_code``: high-school rows keyed by ``str(SchoolNumber)``;
          rows with a falsy SchoolNumber are left out.
        - ``by_district``: ``{DistrictID: summed plan}``.
        - ``by_school``: ``{(DistrictID, SchoolTarget): {"name", "plan"}}`` where
          ``plan`` is the sum over all rows of that pair and ``name`` is the
          name on the first row seen.
    """
    cursor.execute(
        """
        SELECT ID, SchoolNumber, SchoolFullName
        FROM MPS_School
        WHERE SchoolType1 = '高中'
        """
    )
    school_by_code = {str(row["SchoolNumber"]): row for row in cursor.fetchall() if row["SchoolNumber"]}

    cursor.execute(
        """
        SELECT DistrictID, SchoolTarget, SchoolFullName, SUM(PlanNum) AS total
        FROM MPS_Score
        WHERE ScoreYear = %s AND ScoreType = %s
        GROUP BY DistrictID, SchoolTarget, SchoolFullName
        """,
        (YEAR, SCORE_TYPE),
    )

    by_district = {}
    by_school = {}
    for row in cursor.fetchall():
        district_id = int(row["DistrictID"])
        school_target = int(row["SchoolTarget"])
        plan = int(row["total"] or 0)
        by_district[district_id] = by_district.get(district_id, 0) + plan
        # The query also groups by SchoolFullName, so one (district, target)
        # pair can come back in several rows. Accumulate instead of overwriting,
        # otherwise the school total silently loses all but the last row.
        entry = by_school.setdefault(
            (district_id, school_target), {"name": row["SchoolFullName"], "plan": 0}
        )
        entry["plan"] += plan
    return school_by_code, by_district, by_school
119
+
120
+
121
def main():
    """Compare each configured district's PDF footer totals with the database and print JSON.

    For every district in AUDIT_CONFIG the footer numbers are paired, in order,
    with the configured column specs. Districts whose footer length does not
    match the config are recorded under ``problems`` and skipped.
    """
    conn = pymysql.connect(**parser.DB_CONFIG)
    try:
        with conn.cursor(pymysql.cursors.DictCursor) as cursor:
            school_by_code, db_by_district, db_by_school = load_db(cursor)
    finally:
        conn.close()

    audits = []
    diffs = []
    problems = {}

    for district_id, config in AUDIT_CONFIG.items():
        district_name = parser.DISTRICTS[district_id]
        problem_key = str(district_id)
        pdf_path = os.path.join(parser.BASE_DIR, f"2026名额到校{district_name}.pdf")
        numbers = find_footer_numbers(pdf_path)
        if len(numbers) != len(config):
            problems[problem_key] = {
                "district": district_name,
                "problem": "footer_number_count_mismatch",
                "numbers": numbers,
                "expected_columns": len(config),
            }
            continue

        def school_plan(target):
            """Database plan for one target high school in the current district."""
            return db_by_school.get((district_id, target), {}).get("plan", 0)

        # First pass: turn footer columns into audit items and collect every
        # listed target, so the second pass sees the complete list.
        listed_targets = []
        official_items = []
        for spec, official_plan in zip(config, numbers):
            kind = spec["kind"]
            if kind == "ignore":
                continue
            if kind == "school_total":
                school = school_by_code.get(spec["code"])
                if not school:
                    bucket = problems.setdefault(
                        problem_key, {"district": district_name, "problems": []}
                    )
                    bucket.setdefault("problems", []).append(
                        {"problem": "school_code_not_found", "code": spec["code"]}
                    )
                    continue
                target_id = int(school["ID"])
                entry = {
                    "kind": kind,
                    "official_plan": official_plan,
                    "SchoolNumber": spec["code"],
                    "SchoolTarget": target_id,
                    "label": school["SchoolFullName"],
                }
                listed_targets.append(target_id)
            else:
                entry = {"kind": kind, "official_plan": official_plan, "label": spec["label"]}
            official_items.append(entry)

        # Second pass: look up the matching database figure for each item.
        district_rows = []
        for entry in official_items:
            district_plan = db_by_district.get(district_id, 0)
            if entry["kind"] == "district_total":
                db_plan = district_plan
            elif entry["kind"] == "delegated_total":
                db_plan = district_plan - sum(school_plan(target) for target in listed_targets)
            else:
                db_plan = school_plan(entry["SchoolTarget"])

            delta = db_plan - entry["official_plan"]
            row = {**entry, "db_plan": db_plan, "delta_db_minus_official": delta}
            district_rows.append(row)
            if delta:
                diffs.append({"DistrictID": district_id, "district": district_name, **row})

        audits.append({"DistrictID": district_id, "district": district_name, "rows": district_rows})

    report = {
        "audited_districts": len(AUDIT_CONFIG),
        "audited_district_ids": sorted(AUDIT_CONFIG),
        "diff_count": len(diffs),
        "diffs": diffs,
        "problems": problems,
        "audits": audits,
    }
    print(json.dumps(report, ensure_ascii=False, default=str, indent=2))
209
+
210
+
211
+if __name__ == "__main__":
212
+    main()

+ 146 - 0
秒过分数线数据导入/fix_mps_score_school_quota_2026_bad_targets.py

@@ -0,0 +1,146 @@
1
+import json
2
+import os
3
+import sys
4
+from collections import defaultdict
5
+from datetime import datetime
6
+
7
+sys.path.insert(0, "/private/tmp/codex_mysql_driver")
8
+import pymysql
9
+
10
+import research_mps_score_school_quota_2026 as parser
11
+from import_mps_score_school_quota_2026 import INSERT_COLUMNS, build_record, load_previous_plan_nums
12
+
13
+
14
+AFFECTED_DISTRICTS = [5, 9, 11, 13, 14, 15]
15
+BACKUP_FILE = "mps_score_school_quota_2026_bad_targets_backup.json"
16
+
17
+
18
def aggregate_rows(rows):
    """Merge parsed rows that share the same (junior ID, high ID) pair.

    Plan numbers are summed, the junior/high objects of the first occurrence
    are kept, and the distinct match methods on each side are joined with "+"
    in sorted order. Output follows first-seen order of the pairs.
    """
    merged = {}
    for junior, high, plan_num, junior_method, high_method in rows:
        pair = (int(junior["ID"]), int(high["ID"]))
        bucket = merged.get(pair)
        if bucket is None:
            bucket = merged[pair] = {
                "junior": junior,
                "high": high,
                "plan": 0,
                "junior_methods": set(),
                "high_methods": set(),
            }
        bucket["plan"] += int(plan_num)
        bucket["junior_methods"].add(junior_method)
        bucket["high_methods"].add(high_method)

    result = []
    for bucket in merged.values():
        result.append(
            (
                bucket["junior"],
                bucket["high"],
                bucket["plan"],
                "+".join(sorted(bucket["junior_methods"])),
                "+".join(sorted(bucket["high_methods"])),
            )
        )
    return result
32
+
33
+
34
def load_existing(cursor):
    """Fetch all current 2026 "名额到校" rows of the affected districts.

    Only the placeholder list is interpolated into the SQL; the district IDs
    themselves are passed as query parameters.
    """
    placeholders = ", ".join("%s" for _ in AFFECTED_DISTRICTS)
    query = f"""
        SELECT *
        FROM MPS_Score
        WHERE ScoreYear = '2026'
          AND ScoreType = '名额到校'
          AND DistrictID IN ({placeholders})
        ORDER BY DistrictID, SchoolOfGraduation, SchoolTarget, ID
        """
    cursor.execute(query, tuple(AFFECTED_DISTRICTS))
    return cursor.fetchall()
48
+
49
+
50
def collect_new_records(cursor):
    """Re-parse every affected district's PDF and build the records to insert.

    Returns ``(records_by_district, problems_by_district)``. A district appears
    in the second dict only when its parse reported problems; at most the first
    100 problems are kept, as ``repr`` strings.
    """
    high_by_code, high_by_name, _ = parser.load_schools(cursor, "高中")
    junior_by_code, junior_by_name, _ = parser.load_schools(cursor, "初中")
    previous = load_previous_plan_nums(cursor)

    records_by_district = {}
    problems_by_district = {}
    for district_id in AFFECTED_DISTRICTS:
        district_name = parser.DISTRICTS[district_id]
        pdf_path = os.path.join(parser.BASE_DIR, f"2026名额到校{district_name}.pdf")
        parsed_rows, problems = parser.parse_tables(
            pdf_path, district_id, high_by_code, high_by_name, junior_by_code, junior_by_name
        )

        # Collapse duplicate (junior, high) pairs before building DB records.
        records = []
        for merged_row in aggregate_rows(parsed_rows):
            records.append(build_record(district_id, merged_row, previous))
        records_by_district[district_id] = records

        if not problems:
            continue
        problems_by_district[str(district_id)] = {
            "district": district_name,
            "file": pdf_path,
            "problem_count": len(problems),
            "problems": [repr(item) for item in problems[:100]],
        }
    return records_by_district, problems_by_district
73
+
74
+
75
def insert_records(cursor, records):
    """Bulk-insert ``records`` into MPS_Score and return how many were sent.

    Each record is a mapping holding every column named in INSERT_COLUMNS.
    Nothing is executed for an empty list.
    """
    if not records:
        return 0
    statement = "INSERT INTO MPS_Score ({}) VALUES ({})".format(
        ", ".join(INSERT_COLUMNS),
        ", ".join("%s" for _ in INSERT_COLUMNS),
    )
    # One positional parameter list per record, in INSERT_COLUMNS order.
    parameter_sets = [[record[column] for column in INSERT_COLUMNS] for record in records]
    cursor.executemany(statement, parameter_sets)
    return len(records)
83
+
84
+
85
def main():
    """Back up, delete and rebuild the 2026 "名额到校" rows of the affected districts.

    Steps, all on one connection: load the existing rows, re-parse the PDFs,
    abort with RuntimeError if any parse problem was reported, write the old
    rows to BACKUP_FILE, delete them, insert the rebuilt records and commit.
    Any exception rolls the transaction back and is re-raised.
    """
    conn = pymysql.connect(**parser.DB_CONFIG)
    try:
        with conn.cursor(pymysql.cursors.DictCursor) as cursor:
            old_rows = load_existing(cursor)
            records_by_district, problems = collect_new_records(cursor)
            if problems:
                # Refuse to touch the database while any district failed to parse cleanly.
                raise RuntimeError(json.dumps(problems, ensure_ascii=False, indent=2))

            snapshot = {
                "created_at": datetime.now().isoformat(timespec="seconds"),
                "affected_districts": AFFECTED_DISTRICTS,
                "row_count": len(old_rows),
                "rows": old_rows,
            }
            # The backup is written before anything is deleted.
            with open(BACKUP_FILE, "w", encoding="utf-8") as handle:
                json.dump(snapshot, handle, ensure_ascii=False, indent=2, default=str)
                handle.write("\n")
            print("backup", BACKUP_FILE, "rows", len(old_rows))

            pending = []
            for district_id in AFFECTED_DISTRICTS:
                district_records = records_by_district[district_id]
                plan_total = sum(record["PlanNum"] for record in district_records)
                print(
                    "ready",
                    district_id,
                    parser.DISTRICTS[district_id],
                    "rows",
                    len(district_records),
                    "plan",
                    plan_total,
                )
                pending.extend(district_records)

            placeholders = ", ".join("%s" for _ in AFFECTED_DISTRICTS)
            delete_sql = f"""
                DELETE FROM MPS_Score
                WHERE ScoreYear = '2026'
                  AND ScoreType = '名额到校'
                  AND DistrictID IN ({placeholders})
                """
            cursor.execute(delete_sql, tuple(AFFECTED_DISTRICTS))
            deleted = cursor.rowcount

            inserted = insert_records(cursor, pending)
            conn.commit()
            print("deleted", deleted)
            print("inserted", inserted)
    except Exception:
        conn.rollback()
        raise
    finally:
        conn.close()
143
+
144
+
145
+if __name__ == "__main__":
146
+    main()

Разница между файлами не показана из-за своего большого размера
+ 52814 - 0
秒过分数线数据导入/mps_score_school_quota_2026_bad_targets_backup.json


Разница между файлами не показана из-за своего большого размера
+ 2360 - 0
秒过分数线数据导入/mps_score_school_quota_2026_qingpu_reparse_backup.json


+ 73 - 4
秒过分数线数据导入/research_mps_score_school_quota_2026.py

@@ -157,10 +157,6 @@ def match_school(code, name, by_code, by_name, district_id=None):
157 157
     if code and code in by_code:
158 158
         return by_code[code], "code"
159 159
     cleaned = clean_text(name)
160
-    if by_code is not None:
161
-        for alias, alias_code in HIGH_ALIAS_CODES.items():
162
-            if alias in cleaned and alias_code in by_code:
163
-                return by_code[alias_code], f"alias:{alias}"
164 160
     candidates = []
165 161
     for variant in name_variants(name):
166 162
         if variant in by_name:
@@ -182,6 +178,10 @@ def match_school(code, name, by_code, by_name, district_id=None):
182 178
         return candidates[0], "name"
183 179
     if candidates:
184 180
         return None, f"ambiguous:{[row['SchoolFullName'] for row in candidates[:4]]}"
181
+    if by_code is not None:
182
+        for alias, alias_code in sorted(HIGH_ALIAS_CODES.items(), key=lambda item: len(item[0]), reverse=True):
183
+            if alias in cleaned and alias_code in by_code:
184
+                return by_code[alias_code], f"alias:{alias}"
185 185
     if district_id is not None:
186 186
         fuzzy_candidates = []
187 187
         for variant in name_variants(name):
@@ -209,6 +209,72 @@ def match_school(code, name, by_code, by_name, district_id=None):
209 209
     return None, "not_found"
210 210
 
211 211
 
212
def parse_qingpu_text(path, high_by_code, junior_by_code):
    """Parse the Qingpu quota PDF using table row coordinates.

    High-school codes printed in the left part of a page (word ``x0`` <= 230)
    are collected with their vertical position. While walking the table rows,
    the "current" high school switches whenever such a code lies within a row's
    vertical span (±3), or when the first cell of a 5-column row holds a known
    high-school code. The current high school carries over between tables and pages.

    Returns ``(rows, problems)`` where each row is
    ``(junior, high, plan_num, "code", "code")`` and each problem is
    ``(raw_row, kind, reason)``.
    """
    rows = []
    problems = []
    current_high = None

    def left_column_markers(page):
        """High-school code words in the left part of the page, sorted top to bottom."""
        found = []
        for word in page.extract_words(x_tolerance=3, y_tolerance=3):
            if word.get("x0", 999) > 230:
                continue
            code = clean_code(word.get("text"))
            if code in high_by_code:
                found.append((float(word["top"]), code, high_by_code[code]))
        return sorted(found, key=lambda marker: marker[0])

    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            markers = left_column_markers(page)

            for table in page.find_tables():
                cell_rows = table.extract()
                table_top = float(table.bbox[1])
                # A code printed above the table applies from its first row.
                above = [marker for marker in markers if marker[0] < table_top]
                if above:
                    current_high = above[-1][2]

                for index, raw in enumerate(cell_rows):
                    if not raw:
                        continue

                    bbox = table.rows[index].bbox
                    span_low = float(bbox[1]) - 3
                    span_high = float(bbox[3]) + 3
                    # NOTE(review): the 3pt slack could also catch a code that belongs to
                    # the next row if its text starts less than 3pt below this row's
                    # bottom edge - confirm against the PDF layout.
                    touching = [marker for marker in markers if span_low <= marker[0] <= span_high]
                    if touching:
                        current_high = touching[-1][2]

                    width = len(raw)
                    if width < 3:
                        continue
                    if width >= 5:
                        # Skip repeated header rows.
                        if any("招生学校代码" in clean_text(cell) for cell in raw):
                            continue
                        leading_code = clean_code(raw[0])
                        if leading_code in high_by_code:
                            current_high = high_by_code[leading_code]
                        code_cell, name_cell, plan_cell = raw[2], raw[3], raw[4]
                    else:
                        code_cell, name_cell, plan_cell = raw[0], raw[1], raw[2]

                    junior_code = clean_code(code_cell)
                    junior_name = clean_text(name_cell)
                    plan_num = clean_num(plan_cell)

                    if not current_high:
                        problems.append((raw, "high", "not_found"))
                        continue
                    if not (junior_code or junior_name):
                        continue
                    junior = junior_by_code.get(junior_code)
                    if not junior:
                        problems.append((raw, "code", "junior_not_found"))
                        continue
                    # Rows without a positive plan number carry no quota.
                    if plan_num is None or plan_num == 0:
                        continue
                    rows.append((junior, current_high, plan_num, "code", "code"))

    return rows, problems
276
+
277
+
212 278
 def extract_codes_from_header(header_rows, col_index):
213 279
     for row in header_rows:
214 280
         if col_index < len(row):
@@ -346,6 +412,9 @@ def parse_matrix_table(table, district_id, high_by_code, high_by_name, junior_by
346 412
 
347 413
 
348 414
 def parse_tables(path, district_id, high_by_code, high_by_name, junior_by_code, junior_by_name):
415
+    if district_id == 14:
416
+        return parse_qingpu_text(path, high_by_code, junior_by_code)
417
+
349 418
     all_rows = []
350 419
     all_problems = []
351 420
     long_state = {}