chengjie 2 miesięcy temu
rodzic
commit
2dab6b65fa

+ 3 - 1
.gitignore

@@ -13,4 +13,6 @@ src/web_crawler/Hidden-Assassin/
13 13
 src/web_crawler/Release-that-Witch/
14 14
 src/web_crawler/Strange-Life-of-a-Cat/
15 15
 src/web_crawler/Throne-of-Magical-Arcana/
16
-src/web_crawler/The-Legendary-Mechanic/
16
+src/web_crawler/The-Legendary-Mechanic/
17
+src/web_crawler/奥术神座/
18
+src/web_crawler/回到过去变成猫/

+ 19 - 5
package-lock.json

@@ -1430,11 +1430,11 @@
1430 1430
       }
1431 1431
     },
1432 1432
     "iconv-lite": {
1433
-      "version": "0.4.24",
1434
-      "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz",
1435
-      "integrity": "sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA==",
1433
+      "version": "0.7.0",
1434
+      "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.7.0.tgz",
1435
+      "integrity": "sha512-cf6L2Ds3h57VVmkZe+Pn+5APsT7FpqJtEhhieDCvrE2MK5Qk9MyffgQyuxQTm6BChfeZNtcOLHp9IcWRVcIcBQ==",
1436 1436
       "requires": {
1437
-        "safer-buffer": ">= 2.1.2 < 3"
1437
+        "safer-buffer": ">= 2.1.2 < 3.0.0"
1438 1438
       }
1439 1439
     },
1440 1440
     "ieee754": {
@@ -1658,6 +1658,8 @@
1658 1658
     },
1659 1659
     "jszip": {
1660 1660
       "version": "3.10.1",
1661
+      "resolved": "https://registry.npmjs.org/jszip/-/jszip-3.10.1.tgz",
1662
+      "integrity": "sha512-xXDvecyTpGLrqFrvkrUSoxxfJI5AH7U8zxxtVclpsUtMCq4JQ290LY8AW5c7Ggnr/Y/oK+bQMbqK2qmtk3pN4g==",
1661 1663
       "requires": {
1662 1664
         "lie": "~3.3.0",
1663 1665
         "pako": "~1.0.2",
@@ -2274,6 +2276,16 @@
2274 2276
         "http-errors": "2.0.0",
2275 2277
         "iconv-lite": "0.4.24",
2276 2278
         "unpipe": "1.0.0"
2279
+      },
2280
+      "dependencies": {
2281
+        "iconv-lite": {
2282
+          "version": "0.4.24",
2283
+          "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz",
2284
+          "integrity": "sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA==",
2285
+          "requires": {
2286
+            "safer-buffer": ">= 2.1.2 < 3"
2287
+          }
2288
+        }
2277 2289
       }
2278 2290
     },
2279 2291
     "readable-stream": {
@@ -2743,7 +2755,9 @@
2743 2755
       "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw=="
2744 2756
     },
2745 2757
     "uuid": {
2746
-      "version": "13.0.0"
2758
+      "version": "13.0.0",
2759
+      "resolved": "https://registry.npmjs.org/uuid/-/uuid-13.0.0.tgz",
2760
+      "integrity": "sha512-XQegIaBTVUjSHliKqcnFqYypAd4S+WCYt5NIeRs6w/UAry7z8Y9j5ZwRRL4kzq9U3sD6v+85er9FvkEaBpji2w=="
2747 2761
     },
2748 2762
     "vary": {
2749 2763
       "version": "1.1.2",

+ 1 - 0
package.json

@@ -30,6 +30,7 @@
30 30
     "child_process": "^1.0.2",
31 31
     "cos-nodejs-sdk-v5": "^2.16.0-beta.3",
32 32
     "gm": "^1.25.1",
33
+    "iconv-lite": "^0.7.0",
33 34
     "jsdom": "^27.0.0",
34 35
     "jszip": "^3.10.1",
35 36
     "koa": "^2.13.4",

+ 0 - 769
src/web_crawler/crawle.js

@@ -1,769 +0,0 @@
1
-import fs from 'fs';
2
-import axios from 'axios';
3
-import * as cheerio from 'cheerio';
4
-import path from 'path';
5
-import { setTimeout } from 'timers/promises';
6
-import { fileURLToPath } from 'url';
7
-import JSZip from 'jszip';
8
-import { v4 as uuidv4 } from 'uuid';
9
-import { JSDOM } from 'jsdom';
10
-
11
-const __filename = fileURLToPath(import.meta.url);
12
-const __dirname = path.dirname(__filename);
13
-
14
-/**
15
- * 格式化章节内容
16
- * @param {string} content - 原始文本内容
17
- * @returns {string} - 格式化后的HTML内容
18
- */
19
-async function formatChapterContent(content) {
20
-    // 将章节标题替换为<h2>
21
-    content = content.replace(/Chapter \d+/g, match => `<h2>${match}</h2>`);
22
-    
23
-    // 将正文段落用<p>包裹
24
-    content = content.replace(/\n\n/g, '</p><p>');
25
-    content = `<p>${content}</p>`;
26
-    
27
-    return content;
28
-}
29
-
30
-/**
31
- * 处理文本文件,为章节标题添加h2标签,为段落添加p标签
32
- * @param {string} filePath - 文本文件路径
33
- * @param {string} outputPath - 输出文件路径
34
- * @returns {Promise<void>}
35
- */
36
-async function formatTextFile(filePath, outputPath) {
37
-    try {
38
-        // 读取文本文件
39
-        const content = await fs.promises.readFile(filePath, 'utf-8');
40
-        console.log(`已读取文件: ${filePath}`);
41
-        
42
-        // 分割成章节
43
-        const chapters = content.split(/Chapter \d+/).filter(Boolean);
44
-        console.log(`检测到 ${chapters.length} 个章节内容块`);
45
-        
46
-        let formattedContent = '';
47
-        let chapterIndex = 1;
48
-        
49
-        // 处理每个章节
50
-        for (const chapter of chapters) {
51
-            // 添加章节标题
52
-            formattedContent += `<h2>Chapter ${chapterIndex}</h2>\n`;
53
-            
54
-            // 处理章节内容,将段落用<p>标签包裹
55
-            const paragraphs = chapter.trim().split(/\n\s*\n/);
56
-            for (const paragraph of paragraphs) {
57
-                if (paragraph.trim()) {
58
-                    formattedContent += `<p>${paragraph.trim()}</p>\n`;
59
-                }
60
-            }
61
-            
62
-            chapterIndex++;
63
-        }
64
-        
65
-        // 写入输出文件
66
-        await fs.promises.writeFile(outputPath, formattedContent);
67
-        console.log(`格式化完成! 结果已保存到: ${outputPath}`);
68
-        console.log(`共处理了 ${chapters.length} 个章节`);
69
-        
70
-    } catch (error) {
71
-        console.error('处理文本文件时出错:', error);
72
-    }
73
-}
74
-
75
-async function mergeChapterFiles(sourceDir, outputFile) {
76
-    try {
77
-        const SEPARATOR = '\n******************\n';
78
-        const { readdir, readFile, writeFile } = fs.promises;
79
-
80
-        // 获取所有HTML文件并按章节顺序排序
81
-        const files = (await readdir(sourceDir))
82
-            .filter(file => file.endsWith('.html'))
83
-            .sort((a, b) => parseInt(a.split('_')[0]) - parseInt(b.split('_')[0]));
84
-
85
-        if (files.length === 0) {
86
-            console.error('未找到任何章节文件');
87
-            return;
88
-        }
89
-
90
-        let mergedContent = '';
91
-        
92
-        // 处理每个章节文件
93
-        for (const file of files) {
94
-            const filePath = path.join(sourceDir, file);
95
-            const html = await readFile(filePath, 'utf-8');
96
-            
97
-            // 提取章节标题
98
-            const titleMatch = file.match(/_([^\.]+)\.html$/);
99
-            const title = titleMatch ? titleMatch[1] : file;
100
-            
101
-            // 提取正文内容
102
-            const contentMatch = html.match(/<div id="showReading"[^>]*>([\s\S]*?)<\/div>/);
103
-            if (!contentMatch) continue;
104
-            
105
-            let content = contentMatch[1]
106
-                .replace(/<sent[^>]*>/g, '')
107
-                .replace(/<\/sent>/g, '')
108
-                .replace(/<br>/g, '\n')
109
-                .replace(/<[^>]+>/g, '')
110
-                .replace(/\(adsbygoogle\s*=\s*window\.adsbygoogle\s*\|\|\s*\[\]\).push\(\{\}\);/g, '')
111
-                .replace(/\n{3,}/g, '\n\n');
112
-            
113
-            // 添加到合并内容
114
-            mergedContent += `${title}\n\n${content.trim()}${SEPARATOR}`;
115
-            console.log(`已处理: ${file}`);
116
-        }
117
-
118
-        // 格式化合并后的内容
119
-        const formattedContent = mergedContent;
120
-        
121
-        // 写入合并文件
122
-        await writeFile(outputFile, formattedContent);
123
-        console.log(`\n合并完成! 结果已保存到: ${outputFile}`);
124
-        console.log(`共合并了 ${files.length} 个章节`);
125
-
126
-    } catch (error) {
127
-        console.error('合并章节时出错:', error);
128
-    }
129
-}
130
-
131
-/**
132
- * 爬取单个章节内容
133
- * @param {string} baseUrl - 网站基础URL
134
- * @param {string} chapterUrl - 章节URL
135
- * @param {object} headers - 请求头
136
- * @param {boolean} debug - 是否开启调试模式
137
- * @returns {Promise<string|null>} - 章节内容或null
138
- */
139
-async function fetchChapterContent(baseUrl, chapterUrl, headers, debug = false) {
140
-    try {
141
-        // 如果URL不是以http开头,则添加baseUrl
142
-        const fullUrl = chapterUrl.startsWith('http') ? chapterUrl : `${baseUrl}${chapterUrl}`;
143
-        
144
-        if (debug) {
145
-            console.log(`爬取章节内容: ${fullUrl}`);
146
-        }
147
-        
148
-        const response = await axios.get(fullUrl, { headers });
149
-        
150
-        if (response.status !== 200) {
151
-            console.error(`获取章节内容失败,状态码: ${response.status}`);
152
-            return null;
153
-        }
154
-        
155
-        const $ = cheerio.load(response.data);
156
-        
157
-        // 尝试多种选择器来获取章节内容
158
-        const contentSelectors = [
159
-            '.chapter-content',
160
-            '.article-content',
161
-            '.content',
162
-            '#content',
163
-            '.text-content',
164
-            '.chapter-text',
165
-            '.novel-content'
166
-        ];
167
-        
168
-        let content = null;
169
-        
170
-        for (const selector of contentSelectors) {
171
-            const element = $(selector);
172
-            if (element.length > 0) {
173
-                content = element.html();
174
-                if (debug) {
175
-                    console.log(`使用选择器 ${selector} 成功获取章节内容`);
176
-                }
177
-                break;
178
-            }
179
-        }
180
-        
181
-        // 如果上面的选择器都没找到内容,尝试查找包含大量文本的元素
182
-        if (!content) {
183
-            let maxTextLength = 0;
184
-            let maxTextElement = null;
185
-            
186
-            $('div, article, section, p').each((_, element) => {
187
-                const text = $(element).text().trim();
188
-                if (text.length > maxTextLength && text.length > 500) {
189
-                    maxTextLength = text.length;
190
-                    maxTextElement = element;
191
-                }
192
-            });
193
-            
194
-            if (maxTextElement) {
195
-                content = $(maxTextElement).html();
196
-                if (debug) {
197
-                    console.log(`使用最长文本元素获取章节内容,长度: ${maxTextLength}`);
198
-                }
199
-            }
200
-        }
201
-        
202
-        return content;
203
-    } catch (error) {
204
-        console.error(`爬取章节内容出错: ${error.message}`);
205
-        return null;
206
-    }
207
-}
208
-
209
-/**
210
- * 爬取小说章节目录
211
- * @param {string} title - 小说标题
212
- * @param {boolean} debug - 是否开启调试模式
213
- * @param {string} outputDir - 输出目录
214
- * @param {boolean} fetchContent - 是否爬取章节内容
215
- * @returns {Promise<Array|Object>} - 章节目录数组或错误对象
216
- */
217
-async function runScript(title, debug = false, outputDir = 'src/web_crawler', fetchContent = false){
218
-    const url = `https://novelhi.com/s/index/`+title;
219
-    console.log(`正在爬取网址: ${url}`);
220
-
221
-    try {
222
-        // 设置请求头,模拟浏览器行为
223
-        const headers = {
224
-            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
225
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
226
-            'Accept-Language': 'en-US,en;q=0.5',
227
-            'Connection': 'keep-alive',
228
-            'Upgrade-Insecure-Requests': '1',
229
-            'Cache-Control': 'max-age=0'
230
-        };
231
-
232
-        const response = await axios.get(url, { headers });
233
-        console.log(`请求状态码: ${response.status}`);
234
-        
235
-        // 如果开启调试模式,保存HTML内容
236
-        if (debug) {
237
-            const debugFilePath = path.join(outputDir, 'debug_page.html');
238
-            fs.writeFileSync(debugFilePath, response.data);
239
-            console.log(`已保存HTML内容到 ${debugFilePath} 文件`);
240
-        }
241
-        
242
-        const $ = cheerio.load(response.data);
243
-        
244
-        // 提取章节目录
245
-        const chapters = [];
246
-        
247
-        // 如果开启调试模式,打印页面结构信息
248
-        if (debug) {
249
-            console.log('页面标题:', $('title').text());
250
-            console.log('页面主要容器数量:');
251
-            console.log('- div 元素数量:', $('div').length);
252
-            console.log('- ul 元素数量:', $('ul').length);
253
-            console.log('- li 元素数量:', $('li').length);
254
-            console.log('- a 元素数量:', $('a').length);
255
-        }
256
-        
257
-        // 尝试多种选择器
258
-        const selectors = [
259
-            '.chapter-list li',
260
-            '.book-catalog-list a',
261
-            '.catalog-list li',
262
-            '.chapter-item',
263
-            '.chapter a',
264
-            'ul.chapters li',
265
-            '.book-chapters a',
266
-            '.novel-chapters a',
267
-            'span:contains("Chapter")'
268
-        ];
269
-        
270
-        for (const selector of selectors) {
271
-            console.log(`尝试选择器: ${selector}`);
272
-            const elements = $(selector);
273
-            console.log(`找到 ${elements.length} 个元素`);
274
-            
275
-            if (elements.length > 0) {
276
-                elements.each((index, element) => {
277
-                    let chapterTitle, chapterUrl;
278
-                    
279
-                    if (selector === 'span:contains("Chapter")') {
280
-                        chapterTitle = $(element).text().trim();
281
-                        // 对于这个网站,我们可能需要构造章节URL
282
-                        chapterUrl = `/s/${title}/${chapterTitle.replace('Chapter ', '')}`;
283
-                    } else if (selector.includes('a')) {
284
-                        chapterTitle = $(element).text().trim();
285
-                        chapterUrl = $(element).attr('href');
286
-                    } else {
287
-                        chapterTitle = $(element).find('a').text().trim();
288
-                        chapterUrl = $(element).find('a').attr('href');
289
-                    }
290
-                    
291
-                    if (chapterTitle) {
292
-                        chapters.push({
293
-                            title: chapterTitle,
294
-                            url: chapterUrl || '#',
295
-                            index: index + 1
296
-                        });
297
-                    }
298
-                });
299
-                
300
-                if (chapters.length > 0) {
301
-                    console.log(`使用选择器 ${selector} 成功找到章节`);
302
-                    break;
303
-                }
304
-            }
305
-        }
306
-        
307
-        // 如果上面的选择器都没有找到章节,尝试更通用的方法
308
-        if (chapters.length === 0) {
309
-            console.log('尝试查找所有可能的章节链接...');
310
-            
311
-            // 查找所有包含"chapter"或"第"字样的链接或文本
312
-            $('*').each((index, element) => {
313
-                const text = $(element).text().trim();
314
-                
315
-                if (text && (text.includes('Chapter') || text.includes('第') || text.includes('章'))) {
316
-                    // 检查是否是单独的章节标题(不包含其他章节)
317
-                    if (text.match(/^Chapter \d+$/) || text.match(/^第[一二三四五六七八九十百千万]+章/) || text.match(/^\d+\.\s+.+$/)) {
318
-                        chapters.push({
319
-                            title: text,
320
-                            url: '#',  // 如果没有URL,使用占位符
321
-                            index: index + 1
322
-                        });
323
-                    }
324
-                }
325
-            });
326
-        }
327
-        
328
-        console.log(`共找到 ${chapters.length} 个章节`);
329
-        
330
-        // 将结果保存到文件中
331
-        const outputFilePath = path.join(outputDir, `${title}_chapters.json`);
332
-        fs.writeFileSync(outputFilePath, JSON.stringify(chapters, null, 2));
333
-        console.log(`已将章节目录保存到 ${outputFilePath} 文件`);
334
-        
335
-        // 打印前10个章节和后10个章节
336
-        if (debug || chapters.length <= 20) {
337
-            console.log("章节目录:");
338
-            console.log(JSON.stringify(chapters, null, 2));
339
-        } else {
340
-            console.log("前10个章节:");
341
-            console.log(JSON.stringify(chapters.slice(0, 10), null, 2));
342
-            
343
-            console.log("...");
344
-            
345
-            console.log("后10个章节:");
346
-            console.log(JSON.stringify(chapters.slice(-10), null, 2));
347
-        }
348
-        
349
-        // 如果需要爬取章节内容
350
-        if (fetchContent && chapters.length > 0) {
351
-            console.log(`开始爬取章节内容...`);
352
-            
353
-            // 创建章节内容目录
354
-            const contentDir = path.join(outputDir, `${title}_contents`);
355
-            if (!fs.existsSync(contentDir)) {
356
-                fs.mkdirSync(contentDir, { recursive: true });
357
-            }
358
-            
359
-            // 创建一个包含所有章节内容的对象
360
-            const contentsObj = {};
361
-            
362
-            // 设置基础URL
363
-            const baseUrl = 'https://novelhi.com';
364
-            
365
-            // 爬取所有章节
366
-            const chaptersToFetch = chapters;
367
-            console.log(`将爬取 ${chaptersToFetch.length}/${chapters.length} 个章节的内容`);
368
-            
369
-            // 爬取章节内容
370
-            for (let i = 0; i < chaptersToFetch.length; i++) {
371
-                const chapter = chaptersToFetch[i];
372
-                
373
-                // 每爬取10个章节,暂停一下,避免请求过于频繁
374
-                if (i > 0 && i % 10 === 0) {
375
-                    console.log(`已爬取 ${i}/${chaptersToFetch.length} 章节,暂停 2 秒...`);
376
-                    await setTimeout(2000);
377
-                }
378
-                
379
-                // 爬取章节内容
380
-                const content = await fetchChapterContent(baseUrl, chapter.url, headers, debug);
381
-                
382
-                if (content) {
383
-                    // 更新章节对象,添加内容
384
-                    chapters[i].content = content;
385
-                    contentsObj[chapter.title] = content;
386
-                    
387
-                    // 将章节内容保存到单独的文件
388
-                    const chapterFileName = `${String(chapter.index).padStart(4, '0')}_${chapter.title.replace(/[\\/:*?"<>|]/g, '_')}.html`;
389
-                    const chapterFilePath = path.join(contentDir, chapterFileName);
390
-                    
391
-                    // 创建一个完整的HTML文件
392
-                    const htmlContent = `<!DOCTYPE html>
393
-<html>
394
-<head>
395
-    <meta charset="UTF-8">
396
-    <title>${chapter.title}</title>
397
-    <style>
398
-        body {
399
-            font-family: Arial, sans-serif;
400
-            line-height: 1.6;
401
-            margin: 0 auto;
402
-            max-width: 800px;
403
-            padding: 20px;
404
-        }
405
-        h1 {
406
-            text-align: center;
407
-            margin-bottom: 30px;
408
-        }
409
-        p {
410
-            text-indent: 2em;
411
-            margin-bottom: 1em;
412
-        }
413
-    </style>
414
-</head>
415
-<body>
416
-    <h1>${chapter.title}</h1>
417
-    ${content}
418
-</body>
419
-</html>`;
420
-                    
421
-                    fs.writeFileSync(chapterFilePath, htmlContent);
422
-                    
423
-                    if (debug) {
424
-                        console.log(`已保存章节 ${chapter.title} 到 ${chapterFilePath}`);
425
-                    } else if (i % 10 === 0 || i === chapters.length - 1) {
426
-                        console.log(`已保存 ${i + 1}/${chapters.length} 章节`);
427
-                    }
428
-                } else {
429
-                    console.error(`获取章节 ${chapter.title} 内容失败`);
430
-                }
431
-            }
432
-            
433
-            // 保存所有章节内容到一个文件
434
-            const allContentsPath = path.join(outputDir, `${title}_all_contents.json`);
435
-            fs.writeFileSync(allContentsPath, JSON.stringify(contentsObj, null, 2));
436
-            console.log(`已将所有章节内容保存到 ${allContentsPath} 文件`);
437
-            
438
-            // 更新章节目录文件,包含内容
439
-            const chaptersWithContentPath = path.join(outputDir, `${title}_chapters_with_content.json`);
440
-            fs.writeFileSync(chaptersWithContentPath, JSON.stringify(chapters, null, 2));
441
-            console.log(`已将包含内容的章节目录保存到 ${chaptersWithContentPath} 文件`);
442
-        }
443
-        
444
-        return chapters;
445
-    } catch (err) {
446
-        console.error("爬取过程中出错:", err.message);
447
-        return { errcode: 101, errStr: err.message };
448
-    } finally {
449
-        console.log("完成");
450
-    }
451
-}
452
-
453
-/**
454
- * 生成 EPUB 电子书
455
- * @param {string} contentFilePath - HTML 格式的正文文件路径
456
- * @param {string} coverImagePath - 封面图片路径
457
- * @param {string} outputPath - 输出 EPUB 文件路径
458
- * @param {string} bookTitle - 电子书标题
459
- * @param {string} author - 作者名称
460
- * @returns {Promise<void>}
461
- */
462
-async function generateEpub(contentFilePath, coverImagePath, outputPath, bookTitle, author) {
463
-    try {
464
-        // 读取 HTML 内容
465
-        const htmlContent = fs.readFileSync(contentFilePath, 'utf-8');
466
-        const dom = new JSDOM(htmlContent);
467
-        const document = dom.window.document;
468
-        
469
-        // 提取章节 (h2 标签)
470
-        const chapterElements = document.querySelectorAll('h2');
471
-        const chapters = [];
472
-        
473
-        // 处理每个章节
474
-        chapterElements.forEach((chapterElement, index) => {
475
-            const title = chapterElement.textContent.trim();
476
-            let content = '';
477
-            
478
-            // 收集当前章节的所有段落,直到下一个 h2 或文档结束
479
-            let currentElement = chapterElement.nextElementSibling;
480
-            while (currentElement && currentElement.tagName.toLowerCase() !== 'h2') {
481
-                if (currentElement.tagName.toLowerCase() === 'p') {
482
-                    // 每个 p 标签作为独立段落,用 <p> 标签包裹并添加样式类
483
-                    content += `<p>${currentElement.innerHTML}</p>\n`;
484
-                }
485
-                currentElement = currentElement.nextElementSibling;
486
-            }
487
-            
488
-            chapters.push({ title, content });
489
-        });
490
-
491
-        // 读取封面图片
492
-        const coverImage = fs.readFileSync(coverImagePath);
493
-
494
-        // 创建 EPUB 容器
495
-        const zip = new JSZip();
496
-
497
-        // 添加 mimetype 文件(必须是第一个文件,且不压缩)
498
-        zip.file('mimetype', 'application/epub+zip', { compression: 'STORE' });
499
-
500
-        // 创建 META-INF 目录
501
-        const metaInf = zip.folder('META-INF');
502
-        metaInf.file('container.xml', `<?xml version="1.0"?>
503
-<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
504
-    <rootfiles>
505
-        <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
506
-    </rootfiles>
507
-</container>`);
508
-
509
-        // 创建 OEBPS 目录
510
-        const oebps = zip.folder('OEBPS');
511
-
512
-        // 添加封面图片
513
-        oebps.file('Images/cover.jpg', coverImage);
514
-
515
-        // 生成封面页 XHTML
516
-        const coverXhtml = `<?xml version="1.0" encoding="UTF-8"?>
517
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
518
-<html xmlns="http://www.w3.org/1999/xhtml">
519
-<head>
520
-    <title>封面</title>
521
-    <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
522
-    <meta name="calibre:cover" content="true"/>
523
-    <link rel="stylesheet" type="text/css" href="../Styles/stylesheet.css"/>
524
-    <style type="text/css">
525
-        body {
526
-            margin: 0;
527
-            padding: 0;
528
-            text-align: center;
529
-        }
530
-        img {
531
-            max-width: 100%;
532
-            height: auto;
533
-            margin: 0;
534
-            padding: 0;
535
-        }
536
-    </style>
537
-</head>
538
-<body>
539
-    <div class="cover-container">
540
-        <img src="../Images/cover.jpg" alt="封面"/>
541
-    </div>
542
-</body>
543
-</html>`;
544
-
545
-        const textFolder = oebps.folder('Text');
546
-        textFolder.file('cover.xhtml', coverXhtml);
547
-
548
-        // 生成章节 HTML 文件
549
-        const spineItems = [
550
-            { idref: 'cover', linear: 'no' }
551
-        ];
552
-        const manifestItems = [
553
-            { id: 'cover', href: 'Text/cover.xhtml', mediaType: 'application/xhtml+xml', properties: 'cover-image' },
554
-            { id: 'cover-image', href: 'Images/cover.jpg', mediaType: 'image/jpeg' },
555
-            { id: 'ncx', href: 'toc.ncx', mediaType: 'application/x-dtbncx+xml' }
556
-        ];
557
-
558
-        chapters.forEach((chapter, index) => {
559
-            const { title, content } = chapter;
560
-            const chapterId = `chapter_${index}`;
561
-            const chapterFileName = `${chapterId}.xhtml`;
562
-
563
-            const chapterHtml = `<?xml version="1.0" encoding="UTF-8"?>
564
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
565
-<html xmlns="http://www.w3.org/1999/xhtml">
566
-<head>
567
-    <title>${title}</title>
568
-    <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
569
-    <link rel="stylesheet" type="text/css" href="../Styles/stylesheet.css"/>
570
-</head>
571
-<body>
572
-    <h2 class="chapter-title">${title}</h2>
573
-    <div class="chapter-content">
574
-        ${content}
575
-    </div>
576
-</body>
577
-</html>`;
578
-
579
-            textFolder.file(chapterFileName, chapterHtml);
580
-            manifestItems.push({ id: chapterId, href: `Text/${chapterFileName}`, mediaType: 'application/xhtml+xml' });
581
-            spineItems.push({ idref: chapterId });
582
-        });
583
-
584
-        // 生成目录文件
585
-        const tocHtml = `<?xml version="1.0" encoding="UTF-8"?>
586
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
587
-<html xmlns="http://www.w3.org/1999/xhtml">
588
-<head>
589
-    <title>目录</title>
590
-    <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
591
-    <link rel="stylesheet" type="text/css" href="../Styles/stylesheet.css"/>
592
-</head>
593
-<body>
594
-    <h2 class="toc-title">目录</h2>
595
-    <ol class="toc-list">
596
-        ${chapters.map((chapter, index) => {
597
-            return `<li class="toc-item"><a href="chapter_${index}.xhtml">${chapter.title}</a></li>`;
598
-        }).join('\n')}
599
-    </ol>
600
-</body>
601
-</html>`;
602
-
603
-        textFolder.file('toc.xhtml', tocHtml);
604
-        manifestItems.push({ id: 'toc', href: 'Text/toc.xhtml', mediaType: 'application/xhtml+xml' });
605
-
606
-        // 添加 CSS 文件
607
-        const cssFolder = oebps.folder('Styles');
608
-        const csspath=path.join(__dirname, 'epub_styles.css');
609
-        console.log("🚀 ~ generateEpub ~ csspath:", csspath)
610
-        
611
-        const cssContent = fs.readFileSync(csspath, 'utf-8');
612
-        cssFolder.file('stylesheet.css', cssContent);
613
-        manifestItems.push({ id: 'stylesheet', href: 'Styles/stylesheet.css', mediaType: 'text/css' });
614
-
615
-        // 生成唯一标识符
616
-        const bookUUID = uuidv4();
617
-        
618
-        // 生成 toc.ncx 文件
619
-        const tocNcx = `<?xml version='1.0' encoding='utf-8'?>
620
-<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="zh-CN">
621
-  <head>
622
-    <meta content="${bookUUID}" name="dtb:uid"/>
623
-    <meta content="2" name="dtb:depth"/>
624
-    <meta content="0" name="dtb:totalPageCount"/>
625
-    <meta content="0" name="dtb:maxPageNumber"/>
626
-  </head>
627
-  <docTitle>
628
-    <text>${bookTitle}</text>
629
-  </docTitle>
630
-  <navMap>
631
-    <navPoint id="navpoint-0" playOrder="0">
632
-      <navLabel>
633
-        <text>封面</text>
634
-      </navLabel>
635
-      <content src="Text/cover.xhtml"/>
636
-    </navPoint>
637
-    <navPoint id="navpoint-1" playOrder="1">
638
-      <navLabel>
639
-        <text>目录</text>
640
-      </navLabel>
641
-      <content src="Text/toc.xhtml"/>
642
-    </navPoint>
643
-    ${chapters.map((chapter, index) => {
644
-        return `<navPoint id="navpoint-${index + 2}" playOrder="${index + 2}">
645
-      <navLabel>
646
-        <text>${chapter.title}</text>
647
-      </navLabel>
648
-      <content src="Text/chapter_${index}.xhtml"/>
649
-    </navPoint>`;
650
-    }).join('\n')}
651
-  </navMap>
652
-</ncx>`;
653
-
654
-        oebps.file('toc.ncx', tocNcx);
655
-
656
-        // 生成 content.opf 文件
657
-        const contentOpf = `<?xml version="1.0" encoding="UTF-8"?>
658
-<package xmlns="http://www.idpf.org/2007/opf" version="2.0" unique-identifier="uuid_id">
659
-    <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
660
-        <dc:identifier id="uuid_id">urn:uuid:${bookUUID}</dc:identifier>
661
-        <dc:title>${bookTitle}</dc:title>
662
-        <dc:creator>${author}</dc:creator>
663
-        <dc:language>zh-CN</dc:language>
664
-        <dc:date>${new Date().toISOString().split('T')[0]}</dc:date>
665
-        <meta name="cover" content="cover-image"/>
666
-    </metadata>
667
-    <manifest>
668
-        ${manifestItems.map(item => `<item id="${item.id}" href="${item.href}" media-type="${item.mediaType}"/>`).join('\n')}
669
-    </manifest>
670
-    <spine toc="ncx">
671
-        <itemref idref="cover"/>
672
-        <itemref idref="toc"/>
673
-        ${spineItems.map(item => `<itemref idref="${item.idref}"/>`).join('\n')}
674
-    </spine>
675
-    <guide>
676
-        <reference type="cover" title="封面" href="Text/cover.xhtml"/>
677
-    </guide>
678
-</package>`;
679
-
680
-        oebps.file('content.opf', contentOpf);
681
-
682
-        // 生成 EPUB 文件
683
-        const epubContent = await zip.generateAsync({ 
684
-            type: 'nodebuffer', 
685
-            compression: 'DEFLATE',
686
-            mimeType: 'application/epub+zip'
687
-        });
688
-        fs.writeFileSync(outputPath, epubContent);
689
-
690
-        console.log(`EPUB 电子书已生成: ${outputPath}`);
691
-    } catch (error) {
692
-        console.error('生成 EPUB 电子书时出错:', error);
693
-    }
694
-}
695
-
696
-
697
-/**
698
- * 命令行入口
699
- */
700
-
701
-//const title = "Zhui-Xu";
702
-const title="Throne-of-Magical-Arcana";
703
-//const title = 'Release-that-Witch';
704
-//const title = 'Strange-Life-of-a-Cat';
705
-//const title = "Hidden-Assassin";
706
-//const author = "Angry Banana";
707
-const author = "Cuttlefish That Loves Diving";
708
-const coverName = "cover.jpg";
709
-
710
-// 从命令行参数获取小说标题
711
-
712
-const args = process.argv.slice(2);
713
-
714
-// 显示帮助信息
715
-if (args.includes("--help") || args.includes("-h")) {
716
-    showHelp();
717
-    process.exit(0);
718
-}
719
-
720
-// 获取参数
721
-const debug = args.includes("--debug");
722
-const outputDir = args.find(arg => arg.startsWith("--output="))?.split("=")[1] || title;
723
-const fetchContent = 1;
724
-
725
-console.log(`开始爬取小说: ${title}`);
726
-console.log(`调试模式: ${debug ? "开启" : "关闭"}`);
727
-console.log(`输出目录: ${outputDir}`);
728
-console.log(`爬取章节内容: ${fetchContent ? "是" : "否"}`);
729
-
730
-// 确保输出目录存在
731
-if (!fs.existsSync(outputDir)) {
732
-    fs.mkdirSync(outputDir, { recursive: true });
733
-}
734
-
735
-if (1==1){
736
-await runScript(title, debug, outputDir, fetchContent)
737
-    .then(result => {
738
-        if (result && result.errcode) {
739
-            console.error(`爬取失败: ${result.errStr}`);
740
-            process.exit(1);
741
-        } else {
742
-            
743
-            console.log("爬取任务完成!");
744
-        }
745
-    })
746
-    .catch(err => {
747
-        console.error("程序执行出错:", err);
748
-        process.exit(1);
749
-    });
750
-}
751
-
752
-let sourceDir = path.join(__dirname, title+'/'+title+'_contents');
753
-let outputFile = path.join(__dirname, title+'/'+title+'.txt');
754
-let outputFile2 = path.join(__dirname, title+'/'+title+'.html');
755
-let coverFile=path.join(__dirname, title+'/'+coverName);
756
-let epubFile=path.join(__dirname, title+'/'+title+'.epub');
757
-
758
-if (1==1){
759
-    await mergeChapterFiles(sourceDir, outputFile)
760
-    .then(() => console.log('合并操作完成'))
761
-    .catch(err => console.error('合并操作失败:', err));
762
-
763
-    await formatTextFile(outputFile, outputFile2);
764
-}
765
-
766
-await generateEpub(outputFile2, coverFile, epubFile, title, author);
767
-
768
-// 导出函数,以便其他模块使用
769
-export default runScript;

Plik diff jest za duży
+ 1157 - 0
src/web_crawler/crawle_chinese.js


+ 4 - 3
src/web_crawler/crawle_english.js

@@ -11,7 +11,8 @@ import { JSDOM } from 'jsdom';
11 11
 const __filename = fileURLToPath(import.meta.url);
12 12
 const __dirname = path.dirname(__filename);
13 13
 
14
-
14
+const WEB_URL=`https://novelhi.com`;
15
+const WEB_URL_INDEX=`/s/index/`;
15 16
 
16 17
 /**
17 18
  * 爬取单个章节内容
@@ -94,7 +95,7 @@ async function fetchChapterContent(baseUrl, chapterUrl, headers) {
94 95
  * @returns {Promise<Array|Object>} - 章节目录数组或错误对象
95 96
  */
96 97
 async function crawleWeb(title, outputDir = 'src/web_crawler', fetchContent = false) {
97
-    const url = `https://novelhi.com/s/index/` + title;
98
+    const url = WEB_URL + WEB_URL_INDEX + title;
98 99
     console.log(`正在爬取网址: ${url}`);
99 100
 
100 101
     try {
@@ -222,7 +223,7 @@ async function crawleWeb(title, outputDir = 'src/web_crawler', fetchContent = fa
222 223
             const contentsObj = {};
223 224
 
224 225
             // 设置基础URL
225
-            const baseUrl = 'https://novelhi.com';
226
+            const baseUrl = WEB_URL;
226 227
 
227 228
             // 爬取所有章节
228 229
             const chaptersToFetch = chapters;