chengjie 2 月之前
父節點
當前提交
04869f98fa
共有 5 個文件被更改,包括 2163 次插入1766 次删除
  1. 1339 1765
      package-lock.json
  2. 5 1
      package.json
  3. 538 0
      src/web_crawler/crawler_yinsha.js
  4. 249 0
      src/web_crawler/epub_generator.js
  5. 32 0
      src/web_crawler/epub_styles.css

File diff suppressed because it is too large
+ 1339 - 1765
package-lock.json


+ 5 - 1
package.json

@@ -26,9 +26,12 @@
26 26
     "@koa/multer": "^3.0.2",
27 27
     "@koa/router": "^13.1.0",
28 28
     "axios": "^1.9.0",
29
+    "cheerio": "^1.1.2",
29 30
     "child_process": "^1.0.2",
30 31
     "cos-nodejs-sdk-v5": "^2.16.0-beta.3",
31 32
     "gm": "^1.25.1",
33
+    "jsdom": "^27.0.0",
34
+    "jszip": "^3.10.1",
32 35
     "koa": "^2.13.4",
33 36
     "koa-bodyparser": "^4.4.1",
34 37
     "koa-router": "^13.0.1",
@@ -40,7 +43,8 @@
40 43
     "mysql2": "^3.14.1",
41 44
     "pdfkit": "^0.17.1",
42 45
     "request-promise": "^4.2.6",
43
-    "tencentcloud-sdk-nodejs-ocr": "^4.1.70"
46
+    "tencentcloud-sdk-nodejs-ocr": "^4.1.70",
47
+    "uuid": "^13.0.0"
44 48
   },
45 49
   "devDependencies": {
46 50
     "cross-env": "^7.0.3"

+ 538 - 0
src/web_crawler/crawler_yinsha.js

@@ -0,0 +1,538 @@
1
+import fs from 'fs';
2
+import axios from 'axios';
3
+import * as cheerio from 'cheerio';
4
+import path from 'path';
5
+import { setTimeout } from 'timers/promises';
6
+
7
/**
 * Converts merged plain-text chapter content into a simple HTML fragment.
 *
 * Every `Chapter <number>` occurrence becomes a standalone `<h2>` heading and
 * the text between headings is split on blank lines into `<p>` paragraphs.
 * Headings are emitted outside of paragraphs (an `<h2>` nested inside a `<p>`
 * is invalid HTML) and whitespace-only paragraphs are dropped.
 *
 * @param {string} content - Raw text content.
 * @returns {Promise<string>} HTML fragment with one element per line, or an
 *   empty string when the input contains no text.
 */
async function formatChapterContent(content) {
    // Splitting on a capturing group keeps the headings in the result:
    // odd positions hold the matched headings, even positions the text between them.
    const segments = String(content ?? '').split(/(Chapter \d+)/);
    const html = [];

    segments.forEach((segment, position) => {
        if (position % 2 === 1) {
            html.push(`<h2>${segment}</h2>`);
            return;
        }

        for (const paragraph of segment.split(/\n\s*\n/)) {
            const text = paragraph.trim();
            if (text) {
                html.push(`<p>${text}</p>`);
            }
        }
    });

    return html.length > 0 ? `${html.join('\n')}\n` : '';
}
22
+
23
/**
 * Converts a plain-text book into HTML: every `Chapter <number>` marker becomes
 * an `<h2>` heading and every blank-line separated paragraph becomes a `<p>`.
 *
 * The heading text found in the file is preserved as-is, so the chapter
 * numbers in the output always match the source. Text that appears before the
 * first marker is emitted as paragraphs without a heading. When the file has
 * no marker at all, the whole text is placed under a single `Chapter 1`
 * heading.
 *
 * Errors are logged and swallowed (best effort); nothing is thrown.
 *
 * @param {string} filePath - Path of the text file to read.
 * @param {string} outputPath - Path of the HTML file to write.
 * @returns {Promise<void>}
 */
async function formatTextFile(filePath, outputPath) {
    // Wraps each non-empty, blank-line separated paragraph of `text` in <p>.
    const toParagraphs = (text) => text
        .split(/\n\s*\n/)
        .map((paragraph) => paragraph.trim())
        .filter(Boolean)
        .map((paragraph) => `<p>${paragraph}</p>\n`)
        .join('');

    try {
        const content = await fs.promises.readFile(filePath, 'utf-8');
        console.log(`已读取文件: ${filePath}`);

        // The capturing group keeps the headings: `rest` alternates
        // heading, body, heading, body, ...
        const [preamble, ...rest] = content.split(/(Chapter \d+)/);
        let chapterCount = rest.length / 2;
        let formattedContent = '';

        if (chapterCount === 0 && preamble.trim()) {
            // No marker in the file: keep the whole text as one chapter.
            formattedContent += '<h2>Chapter 1</h2>\n';
            chapterCount = 1;
        }
        console.log(`检测到 ${chapterCount} 个章节内容块`);

        formattedContent += toParagraphs(preamble);
        for (let i = 0; i < rest.length; i += 2) {
            formattedContent += `<h2>${rest[i]}</h2>\n${toParagraphs(rest[i + 1])}`;
        }

        await fs.promises.writeFile(outputPath, formattedContent);
        console.log(`格式化完成! 结果已保存到: ${outputPath}`);
        console.log(`共处理了 ${chapterCount} 个章节`);
    } catch (error) {
        console.error('处理文本文件时出错:', error);
    }
}
67
+
68
/**
 * Merges the per-chapter HTML files of a directory into one formatted file.
 *
 * Files are expected to be named `<number>_<title>.html`. They are ordered by
 * their numeric prefix (files without one sort last, alphabetically). For each
 * file the inner markup of `<div id="showReading">` is reduced to plain text,
 * prefixed with the chapter title, and the concatenation is passed through
 * `formatChapterContent` before being written to `outputFile`.
 *
 * Errors are logged and swallowed (best effort); nothing is thrown.
 *
 * @param {string} sourceDir - Directory containing the chapter HTML files.
 * @param {string} outputFile - Path of the merged file to write.
 * @returns {Promise<void>}
 */
async function mergeChapterFiles(sourceDir, outputFile) {
    try {
        const SEPARATOR = '\n******************\n';
        const { readdir, readFile, writeFile } = fs.promises;

        // Numeric file-name prefix; files without one are pushed to the end.
        const chapterNumber = (name) => {
            const parsed = Number.parseInt(name.split('_')[0], 10);
            return Number.isNaN(parsed) ? Number.POSITIVE_INFINITY : parsed;
        };

        const files = (await readdir(sourceDir))
            .filter((file) => file.endsWith('.html'))
            // `||` falls back to the name when the difference is 0 or NaN
            // (Infinity - Infinity), keeping the order deterministic.
            .sort((a, b) => (chapterNumber(a) - chapterNumber(b)) || a.localeCompare(b));

        if (files.length === 0) {
            console.error('未找到任何章节文件');
            return;
        }

        let mergedContent = '';
        let mergedCount = 0;

        for (const file of files) {
            const html = await readFile(path.join(sourceDir, file), 'utf-8');

            // Everything after the first underscore is the title; titles may contain dots.
            const titleMatch = file.match(/_(.+)\.html$/);
            const title = titleMatch ? titleMatch[1] : file;

            const contentMatch = html.match(/<div id="showReading"[^>]*>([\s\S]*?)<\/div>/);
            if (!contentMatch) {
                console.error(`未找到正文内容,已跳过: ${file}`);
                continue;
            }

            const content = contentMatch[1]
                .replace(/<sent[^>]*>/g, '')
                .replace(/<\/sent>/g, '')
                // Accept <br>, <br/> and <br /> in any letter case.
                .replace(/<br\s*\/?>/gi, '\n')
                .replace(/<[^>]+>/g, '')
                .replace(/\(adsbygoogle\s*=\s*window\.adsbygoogle\s*\|\|\s*\[\]\)\.push\(\{\}\);/g, '')
                .replace(/\n{3,}/g, '\n\n');

            mergedContent += `${title}\n\n${content.trim()}${SEPARATOR}`;
            mergedCount += 1;
            console.log(`已处理: ${file}`);
        }

        const formattedContent = await formatChapterContent(mergedContent);

        await writeFile(outputFile, formattedContent);
        console.log(`\n合并完成! 结果已保存到: ${outputFile}`);
        // Report the chapters actually merged, not the number of files seen.
        console.log(`共合并了 ${mergedCount} 个章节`);
    } catch (error) {
        console.error('合并章节时出错:', error);
    }
}
123
+
124
/**
 * Downloads one chapter page and returns the inner HTML of its content element.
 *
 * The chapter URL is resolved against `baseUrl` with standard URL resolution,
 * so absolute URLs are used as-is and root-relative or relative paths are
 * joined correctly. Missing URLs and fragment-only placeholders such as `'#'`
 * are rejected up front instead of fetching the site's home page.
 *
 * The content element is the first match of a list of well-known selectors;
 * when none matches, the element with the longest text (over 500 characters)
 * is used instead.
 *
 * @param {string} baseUrl - Base URL of the site.
 * @param {string} chapterUrl - Absolute or relative URL of the chapter.
 * @param {object} headers - HTTP request headers.
 * @param {boolean} debug - Whether to print debug information.
 * @returns {Promise<string|null>} Chapter HTML, or null when the URL is
 *   unusable, the request fails, or no content element is found.
 */
async function fetchChapterContent(baseUrl, chapterUrl, headers, debug = false) {
    const target = typeof chapterUrl === 'string' ? chapterUrl.trim() : '';
    if (target === '' || target.startsWith('#')) {
        console.error(`爬取章节内容出错: 无效的章节URL: ${chapterUrl}`);
        return null;
    }

    try {
        const fullUrl = new URL(target, baseUrl).href;

        if (debug) {
            console.log(`爬取章节内容: ${fullUrl}`);
        }

        const response = await axios.get(fullUrl, { headers });

        if (response.status !== 200) {
            console.error(`获取章节内容失败,状态码: ${response.status}`);
            return null;
        }

        const $ = cheerio.load(response.data);

        const contentSelectors = [
            '.chapter-content',
            '.article-content',
            '.content',
            '#content',
            '.text-content',
            '.chapter-text',
            '.novel-content'
        ];

        const matchedSelector = contentSelectors.find((selector) => $(selector).length > 0);
        let content = null;

        if (matchedSelector !== undefined) {
            content = $(matchedSelector).html();
            if (debug) {
                console.log(`使用选择器 ${matchedSelector} 成功获取章节内容`);
            }
        }

        // Fallback: take the element holding the most text.
        // NOTE(review): an ancestor always holds at least as much text as its
        // descendants, so this tends to select an outer wrapper.
        if (!content) {
            let maxTextLength = 0;
            let maxTextElement = null;

            $('div, article, section, p').each((_, element) => {
                const textLength = $(element).text().trim().length;
                if (textLength > maxTextLength && textLength > 500) {
                    maxTextLength = textLength;
                    maxTextElement = element;
                }
            });

            if (maxTextElement) {
                content = $(maxTextElement).html();
                if (debug) {
                    console.log(`使用最长文本元素获取章节内容,长度: ${maxTextLength}`);
                }
            }
        }

        return content;
    } catch (error) {
        console.error(`爬取章节内容出错: ${error.message}`);
        return null;
    }
}
201
+
202
/** Characters replaced with `_` when a title is used as part of a file name. */
const UNSAFE_FILE_NAME_CHARS = /[\\/:*?"<>|]/g;

/** Escapes the characters that are significant in HTML text content. */
function escapeHtmlText(text) {
    return String(text)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;');
}

/**
 * Tries a list of known chapter-list selectors and returns the chapters found
 * by the first selector that yields at least one titled entry.
 */
function findChaptersBySelectors($, title) {
    const selectors = [
        '.chapter-list li',
        '.book-catalog-list a',
        '.catalog-list li',
        '.chapter-item',
        '.chapter a',
        'ul.chapters li',
        '.book-chapters a',
        '.novel-chapters a',
        'span:contains("Chapter")'
    ];

    for (const selector of selectors) {
        console.log(`尝试选择器: ${selector}`);
        const elements = $(selector);
        console.log(`找到 ${elements.length} 个元素`);

        const chapters = [];
        elements.each((_, element) => {
            const node = $(element);
            let chapterTitle;
            let chapterUrl;

            if (selector === 'span:contains("Chapter")') {
                chapterTitle = node.text().trim();
                // These spans carry no link, so the chapter URL is constructed.
                chapterUrl = `/s/${title}/${chapterTitle.replace('Chapter ', '')}`;
            } else {
                // Decide by the matched element's tag, not by the selector text:
                // every selector string contains the letter "a".
                const link = node.is('a') ? node : node.find('a').first();
                chapterTitle = link.text().trim() || node.text().trim();
                chapterUrl = link.attr('href');
            }

            if (chapterTitle) {
                chapters.push({
                    title: chapterTitle,
                    url: chapterUrl || '#',
                    index: chapters.length + 1
                });
            }
        });

        if (chapters.length > 0) {
            console.log(`使用选择器 ${selector} 成功找到章节`);
            return chapters;
        }
    }

    return [];
}

/**
 * Fallback chapter discovery: scans every element for text that looks like a
 * chapter title. Each distinct title is recorded once, because nested
 * elements share the same text.
 */
function findChaptersByText($) {
    const chapters = [];
    const seenTitles = new Set();

    $('*').each((_, element) => {
        const node = $(element);
        const text = node.text().trim();

        if (!text || !(text.includes('Chapter') || text.includes('第') || text.includes('章'))) {
            return;
        }

        const looksLikeTitle = /^Chapter \d+$/.test(text)
            || /^第[一二三四五六七八九十百千万]+章/.test(text)
            || /^\d+\.\s+.+$/.test(text);
        if (!looksLikeTitle || seenTitles.has(text)) {
            return;
        }
        seenTitles.add(text);

        // Use the enclosing or contained link when there is one; '#' marks "no URL".
        const href = node.closest('a').attr('href') || node.find('a').first().attr('href');
        chapters.push({
            title: text,
            url: href || '#',
            index: chapters.length + 1
        });
    });

    return chapters;
}

/** Builds the standalone HTML page written for one downloaded chapter. */
function buildChapterPage(chapterTitle, content) {
    const safeTitle = escapeHtmlText(chapterTitle);
    return `<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title>${safeTitle}</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            line-height: 1.6;
            margin: 0 auto;
            max-width: 800px;
            padding: 20px;
        }
        h1 {
            text-align: center;
            margin-bottom: 30px;
        }
        p {
            text-indent: 2em;
            margin-bottom: 1em;
        }
    </style>
</head>
<body>
    <h1>${safeTitle}</h1>
    ${content}
</body>
</html>`;
}

/**
 * Crawls the chapter list of a novel on novelhi.com and optionally downloads
 * every chapter.
 *
 * Output written to `outputDir` (created when missing):
 * - `<title>_chapters.json`: the chapter list;
 * - `debug_page.html`: the raw index page (debug mode only);
 * - with `fetchContent`: one HTML file per chapter in `<title>_contents/`,
 *   plus `<title>_all_contents.json` and `<title>_chapters_with_content.json`.
 *
 * @param {string} title - Novel title as used in the site's URL.
 * @param {boolean} debug - Whether to save the index page and print details.
 * @param {string} outputDir - Output directory.
 * @param {boolean} fetchContent - Whether to download chapter contents.
 * @returns {Promise<Array|Object>} The chapter array, or
 *   `{ errcode: 101, errStr }` when crawling fails.
 */
async function runScript(title, debug = false, outputDir = 'src/web_crawler', fetchContent = false){
    const url = `https://novelhi.com/s/index/${encodeURIComponent(title)}`;
    console.log(`正在爬取网址: ${url}`);

    // The title becomes part of file names; strip path separators and the like.
    const fileTitle = String(title).replace(UNSAFE_FILE_NAME_CHARS, '_');

    try {
        fs.mkdirSync(outputDir, { recursive: true });

        // Browser-like headers.
        const headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0'
        };

        const response = await axios.get(url, { headers });
        console.log(`请求状态码: ${response.status}`);

        if (debug) {
            const debugFilePath = path.join(outputDir, 'debug_page.html');
            fs.writeFileSync(debugFilePath, response.data);
            console.log(`已保存HTML内容到 ${debugFilePath} 文件`);
        }

        const $ = cheerio.load(response.data);

        if (debug) {
            console.log('页面标题:', $('title').text());
            console.log('页面主要容器数量:');
            console.log('- div 元素数量:', $('div').length);
            console.log('- ul 元素数量:', $('ul').length);
            console.log('- li 元素数量:', $('li').length);
            console.log('- a 元素数量:', $('a').length);
        }

        let chapters = findChaptersBySelectors($, title);
        if (chapters.length === 0) {
            console.log('尝试查找所有可能的章节链接...');
            chapters = findChaptersByText($);
        }

        console.log(`共找到 ${chapters.length} 个章节`);

        const outputFilePath = path.join(outputDir, `${fileTitle}_chapters.json`);
        fs.writeFileSync(outputFilePath, JSON.stringify(chapters, null, 2));
        console.log(`已将章节目录保存到 ${outputFilePath} 文件`);

        // Print everything for short lists or in debug mode, otherwise only both ends.
        if (debug || chapters.length <= 20) {
            console.log("章节目录:");
            console.log(JSON.stringify(chapters, null, 2));
        } else {
            console.log("前10个章节:");
            console.log(JSON.stringify(chapters.slice(0, 10), null, 2));

            console.log("...");

            console.log("后10个章节:");
            console.log(JSON.stringify(chapters.slice(-10), null, 2));
        }

        if (fetchContent && chapters.length > 0) {
            console.log(`开始爬取章节内容...`);

            const contentDir = path.join(outputDir, `${fileTitle}_contents`);
            fs.mkdirSync(contentDir, { recursive: true });

            const contentsObj = {};
            const baseUrl = 'https://novelhi.com';
            const total = chapters.length;
            console.log(`将爬取 ${total}/${total} 个章节的内容`);

            // Chapters are fetched one at a time on purpose, to stay gentle on the site.
            for (let i = 0; i < total; i++) {
                const chapter = chapters[i];

                // Pause after every 10 chapters to avoid hammering the server.
                if (i > 0 && i % 10 === 0) {
                    console.log(`已爬取 ${i}/${total} 章节,暂停 2 秒...`);
                    await setTimeout(2000);
                }

                const content = await fetchChapterContent(baseUrl, chapter.url, headers, debug);

                if (!content) {
                    console.error(`获取章节 ${chapter.title} 内容失败`);
                    continue;
                }

                chapter.content = content;
                contentsObj[chapter.title] = content;

                const chapterFileName = `${String(chapter.index).padStart(4, '0')}_${chapter.title.replace(UNSAFE_FILE_NAME_CHARS, '_')}.html`;
                const chapterFilePath = path.join(contentDir, chapterFileName);
                fs.writeFileSync(chapterFilePath, buildChapterPage(chapter.title, content));

                if (debug) {
                    console.log(`已保存章节 ${chapter.title} 到 ${chapterFilePath}`);
                } else if (i % 10 === 0 || i === total - 1) {
                    console.log(`已保存 ${i + 1}/${total} 章节`);
                }
            }

            const allContentsPath = path.join(outputDir, `${fileTitle}_all_contents.json`);
            fs.writeFileSync(allContentsPath, JSON.stringify(contentsObj, null, 2));
            console.log(`已将所有章节内容保存到 ${allContentsPath} 文件`);

            const chaptersWithContentPath = path.join(outputDir, `${fileTitle}_chapters_with_content.json`);
            fs.writeFileSync(chaptersWithContentPath, JSON.stringify(chapters, null, 2));
            console.log(`已将包含内容的章节目录保存到 ${chaptersWithContentPath} 文件`);
        }

        return chapters;
    } catch (err) {
        console.error("爬取过程中出错:", err.message);
        return { errcode: 101, errStr: err.message };
    } finally {
        console.log("完成");
    }
}
445
+
446
/**
 * Prints the command-line usage help to standard output.
 *
 * The option list documents the `-h` alias that the command-line entry point
 * accepts, and each example line demonstrates a different invocation.
 *
 * @returns {void}
 */
function showHelp() {
    console.log(`
爬虫程序使用说明:
----------------
用法: node crawler_yinsha.js [小说标题] [选项]

参数:
  小说标题            要爬取的小说标题,默认为 "Hidden-Assassin"

选项:
  --debug            开启调试模式,保存HTML内容并打印详细信息
  --output=<目录>     指定输出目录,默认为 src/web_crawler
  --content          爬取章节内容(默认只爬取目录)
  -h, --help         显示此帮助信息

示例:
  node crawler_yinsha.js "My-Novel-Title"
  node crawler_yinsha.js "Another-Novel" --debug
  node crawler_yinsha.js "Some-Book" --output=./output
  node crawler_yinsha.js "Hidden-Assassin" --content
  node crawler_yinsha.js "Hidden-Assassin" --content --debug
`);
}
473
+
474
/**
 * Command-line entry point.
 *
 * Runs only when this file is executed directly (`node crawler_yinsha.js ...`),
 * never when the module is imported for `runScript`.
 *
 * The task is selected with `--mode=<crawl|merge|format>` and defaults to
 * `crawl`, the usage described by `showHelp`:
 * - `crawl`:  crawl the chapter list (and contents with `--content`);
 * - `merge`:  merge `<title>_contents/*.html` into `<title>_merged.txt`;
 * - `format`: convert `<title>.txt` into `<title>.html`.
 * The merge and format modes read and write next to this module file.
 */
const cliModulePath = (await import('node:url')).fileURLToPath(import.meta.url);
const isMainModule = Boolean(process.argv[1]) && path.resolve(process.argv[1]) === cliModulePath;

if (isMainModule) {
    const args = process.argv.slice(2);

    if (args.includes("--help") || args.includes("-h")) {
        showHelp();
        process.exit(0);
    }

    // Value of a `--name=value` option; everything after the first '=' is kept.
    const optionValue = (name) => {
        const prefix = `--${name}=`;
        const arg = args.find((candidate) => candidate.startsWith(prefix));
        return arg ? arg.slice(prefix.length) : undefined;
    };

    const title = args.find((arg) => !arg.startsWith("-")) || "Hidden-Assassin";
    const debug = args.includes("--debug");
    const outputDir = optionValue("output") || "src/web_crawler";
    const fetchContent = args.includes("--content");
    const mode = optionValue("mode") || "crawl";
    const moduleDir = path.dirname(cliModulePath);

    if (mode === "crawl") {
        console.log(`开始爬取小说: ${title}`);
        console.log(`调试模式: ${debug ? "开启" : "关闭"}`);
        console.log(`输出目录: ${outputDir}`);
        console.log(`爬取章节内容: ${fetchContent ? "是" : "否"}`);

        if (!fs.existsSync(outputDir)) {
            fs.mkdirSync(outputDir, { recursive: true });
        }

        runScript(title, debug, outputDir, fetchContent)
            .then((result) => {
                if (result && result.errcode) {
                    console.error(`爬取失败: ${result.errStr}`);
                    process.exit(1);
                } else {
                    console.log("爬取任务完成!");
                }
            })
            .catch((err) => {
                console.error("程序执行出错:", err);
                process.exit(1);
            });
    } else if (mode === "merge") {
        const sourceDir = path.join(moduleDir, `${title}_contents`);
        const outputFile = path.join(moduleDir, `${title}_merged.txt`);

        mergeChapterFiles(sourceDir, outputFile)
            .then(() => console.log('合并操作完成'))
            .catch((err) => console.error('合并操作失败:', err));
    } else if (mode === "format") {
        const sourceFile = path.join(moduleDir, `${title}.txt`);
        const outputFile = path.join(moduleDir, `${title}.html`);

        await formatTextFile(sourceFile, outputFile);
    } else {
        console.error(`未知模式: ${mode}`);
        showHelp();
        process.exit(1);
    }
}
536
+
537
+// 导出函数,以便其他模块使用
538
+export default runScript;

+ 249 - 0
src/web_crawler/epub_generator.js

@@ -0,0 +1,249 @@
1
+import fs from 'fs';
2
+import path from 'path';
3
+import JSZip from 'jszip';
4
+import { v4 as uuidv4 } from 'uuid';
5
+import { JSDOM } from 'jsdom';
6
+import { fileURLToPath } from 'url';
7
+
8
+// 获取当前文件的目录路径
9
+const __filename = fileURLToPath(import.meta.url);
10
+const __dirname = path.dirname(__filename);
11
+
12
/**
 * Generates an EPUB 2 e-book from an HTML file.
 *
 * Every `<h2>` in the input starts a chapter; the `<p>` siblings that follow
 * it, up to the next `<h2>`, form the chapter body. The archive contains a
 * cover page, a table-of-contents page, one XHTML file per chapter, the
 * stylesheet `epub_styles.css` read from this module's directory, `toc.ncx`
 * and `content.opf`.
 *
 * Titles and the author name are XML-escaped before being written into the
 * XHTML/NCX/OPF documents, and every text page links the packaged stylesheet.
 *
 * Errors are logged and swallowed (best effort); nothing is thrown.
 *
 * @param {string} contentFilePath - Path of the HTML content file.
 * @param {string} coverImagePath - Path of the cover image (stored as JPEG).
 * @param {string} outputPath - Path of the EPUB file to write.
 * @param {string} bookTitle - Book title.
 * @param {string} author - Author name.
 * @returns {Promise<void>}
 */
async function generateEpub(contentFilePath, coverImagePath, outputPath, bookTitle, author) {
    // Escapes a value for use in XML text content and attribute values.
    const escapeXml = (value) => String(value ?? '')
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&apos;');

    // Text pages live in OEBPS/Text, the stylesheet in OEBPS/Styles.
    const stylesheetLink = '<link rel="stylesheet" type="text/css" href="../Styles/stylesheet.css"/>';

    try {
        const htmlContent = fs.readFileSync(contentFilePath, 'utf-8');
        const { document } = new JSDOM(htmlContent).window;

        // One chapter per <h2>; its body is the run of <p> siblings after it.
        const chapters = Array.from(document.querySelectorAll('h2'), (heading) => {
            let content = '';
            for (
                let node = heading.nextElementSibling;
                node && node.tagName.toLowerCase() !== 'h2';
                node = node.nextElementSibling
            ) {
                if (node.tagName.toLowerCase() === 'p') {
                    content += `<p class="calibre2">${node.innerHTML}</p>\n`;
                }
            }
            return { title: escapeXml(heading.textContent.trim()), content };
        });

        const coverImage = fs.readFileSync(coverImagePath);
        const safeBookTitle = escapeXml(bookTitle);
        const safeAuthor = escapeXml(author);

        const zip = new JSZip();

        // The mimetype entry must be the first file and must be stored uncompressed.
        zip.file('mimetype', 'application/epub+zip', { compression: 'STORE' });

        zip.folder('META-INF').file('container.xml', `<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
    <rootfiles>
        <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
    </rootfiles>
</container>`);

        const oebps = zip.folder('OEBPS');
        oebps.file('Images/cover.jpg', coverImage);

        const coverXhtml = `<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
    <title>封面</title>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
    <style type="text/css">
        body {
            margin: 0;
            padding: 0;
            text-align: center;
        }
        img {
            max-width: 100%;
            height: auto;
            margin: 0;
            padding: 0;
        }
    </style>
</head>
<body>
    <div>
        <img src="../Images/cover.jpg" alt="封面"/>
    </div>
</body>
</html>`;

        const textFolder = oebps.folder('Text');
        textFolder.file('cover.xhtml', coverXhtml);

        const spineItems = [];
        const manifestItems = [
            { id: 'cover', href: 'Text/cover.xhtml', mediaType: 'application/xhtml+xml' },
            { id: 'cover-image', href: 'Images/cover.jpg', mediaType: 'image/jpeg' },
            { id: 'ncx', href: 'toc.ncx', mediaType: 'application/x-dtbncx+xml' }
        ];

        chapters.forEach(({ title, content }, index) => {
            const chapterId = `chapter_${index}`;
            const chapterFileName = `${chapterId}.xhtml`;

            const chapterHtml = `<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
    <title>${title}</title>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
    ${stylesheetLink}
</head>
<body>
    <h1>${title}</h1>
    <div>
        ${content}
    </div>
</body>
</html>`;

            textFolder.file(chapterFileName, chapterHtml);
            manifestItems.push({ id: chapterId, href: `Text/${chapterFileName}`, mediaType: 'application/xhtml+xml' });
            spineItems.push({ idref: chapterId });
        });

        const tocEntries = chapters
            .map((chapter, index) => `<li><a href="chapter_${index}.xhtml">${chapter.title}</a></li>`)
            .join('\n');

        const tocHtml = `<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
    <title>目录</title>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
    ${stylesheetLink}
</head>
<body>
    <h1>目录</h1>
    <ol>
        ${tocEntries}
    </ol>
</body>
</html>`;

        textFolder.file('toc.xhtml', tocHtml);
        manifestItems.push({ id: 'toc', href: 'Text/toc.xhtml', mediaType: 'application/xhtml+xml' });

        const cssContent = fs.readFileSync(path.join(__dirname, 'epub_styles.css'), 'utf-8');
        oebps.folder('Styles').file('stylesheet.css', cssContent);
        manifestItems.push({ id: 'stylesheet', href: 'Styles/stylesheet.css', mediaType: 'text/css' });

        const bookUUID = uuidv4();

        // playOrder values are consecutive positive integers: cover = 1,
        // table of contents = 2, chapters from 3.
        const chapterNavPoints = chapters
            .map((chapter, index) => `<navPoint id="navpoint-${index + 2}" playOrder="${index + 3}">
      <navLabel>
        <text>${chapter.title}</text>
      </navLabel>
      <content src="Text/chapter_${index}.xhtml"/>
    </navPoint>`)
            .join('\n');

        // The navMap is flat, hence dtb:depth = 1.
        const tocNcx = `<?xml version='1.0' encoding='utf-8'?>
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="zh-CN">
  <head>
    <meta content="${bookUUID}" name="dtb:uid"/>
    <meta content="1" name="dtb:depth"/>
    <meta content="0" name="dtb:totalPageCount"/>
    <meta content="0" name="dtb:maxPageNumber"/>
  </head>
  <docTitle>
    <text>${safeBookTitle}</text>
  </docTitle>
  <navMap>
    <navPoint id="navpoint-0" playOrder="1">
      <navLabel>
        <text>封面</text>
      </navLabel>
      <content src="Text/cover.xhtml"/>
    </navPoint>
    <navPoint id="navpoint-1" playOrder="2">
      <navLabel>
        <text>目录</text>
      </navLabel>
      <content src="Text/toc.xhtml"/>
    </navPoint>
    ${chapterNavPoints}
  </navMap>
</ncx>`;

        oebps.file('toc.ncx', tocNcx);

        const contentOpf = `<?xml version="1.0" encoding="UTF-8"?>
<package xmlns="http://www.idpf.org/2007/opf" version="2.0" unique-identifier="uuid_id">
    <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
        <dc:identifier id="uuid_id">urn:uuid:${bookUUID}</dc:identifier>
        <dc:title>${safeBookTitle}</dc:title>
        <dc:creator>${safeAuthor}</dc:creator>
        <dc:language>zh-CN</dc:language>
        <dc:date>${new Date().toISOString().split('T')[0]}</dc:date>
        <meta name="cover" content="cover-image"/>
    </metadata>
    <manifest>
        ${manifestItems.map((item) => `<item id="${item.id}" href="${item.href}" media-type="${item.mediaType}"/>`).join('\n')}
    </manifest>
    <spine toc="ncx">
        <itemref idref="cover"/>
        <itemref idref="toc"/>
        ${spineItems.map((item) => `<itemref idref="${item.idref}"/>`).join('\n')}
    </spine>
    <guide>
        <reference type="cover" title="封面" href="Text/cover.xhtml"/>
    </guide>
</package>`;

        oebps.file('content.opf', contentOpf);

        const epubContent = await zip.generateAsync({
            type: 'nodebuffer',
            compression: 'DEFLATE',
            mimeType: 'application/epub+zip'
        });
        fs.writeFileSync(outputPath, epubContent);

        console.log(`EPUB 电子书已生成: ${outputPath}`);
    } catch (error) {
        console.error('生成 EPUB 电子书时出错:', error);
    }
}
245
+
246
// Example usage. Guarded so that it runs only when this file is executed
// directly, not whenever another module imports `generateEpub`.
if (process.argv[1] && path.resolve(process.argv[1]) === __filename) {
    generateEpub(
        path.join(__dirname, 'Hidden-Assassin.html'),
        path.join(__dirname, 'cover.jpeg'),
        'Hidden-Assassin.epub',
        'Hidden-Assassin',
        'Angry Banana'
    );
}
248
+
249
+export { generateEpub };

+ 32 - 0
src/web_crawler/epub_styles.css

@@ -0,0 +1,32 @@
1
/* Generic block container. */
.calibre {
    display: block;
    font-size: 1em;
    margin: 0 5pt;
    padding-left: 0;
    padding-right: 0;
}

/* Heading-like block: larger bold text. */
.calibre1 {
    display: block;
    font-size: 1.5em;
    font-weight: bold;
    line-height: 1.2;
    margin: 0.83em 0;
}

/* Body paragraph. */
.calibre2 {
    display: block;
    font-size: 1em;
    font-weight: normal;
    line-height: 1.5;
    margin: 1em 0;
}

/* Plain block element. */
.calibre3 {
    display: block;
}

/* Page margins. */
@page {
    margin-bottom: 5pt;
    margin-top: 5pt;
}