crawle.js 28 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769
  1. import fs from 'fs';
  2. import axios from 'axios';
  3. import * as cheerio from 'cheerio';
  4. import path from 'path';
  5. import { setTimeout } from 'timers/promises';
  6. import { fileURLToPath } from 'url';
  7. import JSZip from 'jszip';
  8. import { v4 as uuidv4 } from 'uuid';
  9. import { JSDOM } from 'jsdom';
  10. const __filename = fileURLToPath(import.meta.url);
  11. const __dirname = path.dirname(__filename);
  12. /**
  13. * 格式化章节内容
  14. * @param {string} content - 原始文本内容
  15. * @returns {string} - 格式化后的HTML内容
  16. */
  17. async function formatChapterContent(content) {
  18. // 将章节标题替换为<h2>
  19. content = content.replace(/Chapter \d+/g, match => `<h2>${match}</h2>`);
  20. // 将正文段落用<p>包裹
  21. content = content.replace(/\n\n/g, '</p><p>');
  22. content = `<p>${content}</p>`;
  23. return content;
  24. }
  25. /**
  26. * 处理文本文件,为章节标题添加h2标签,为段落添加p标签
  27. * @param {string} filePath - 文本文件路径
  28. * @param {string} outputPath - 输出文件路径
  29. * @returns {Promise<void>}
  30. */
  31. async function formatTextFile(filePath, outputPath) {
  32. try {
  33. // 读取文本文件
  34. const content = await fs.promises.readFile(filePath, 'utf-8');
  35. console.log(`已读取文件: ${filePath}`);
  36. // 分割成章节
  37. const chapters = content.split(/Chapter \d+/).filter(Boolean);
  38. console.log(`检测到 ${chapters.length} 个章节内容块`);
  39. let formattedContent = '';
  40. let chapterIndex = 1;
  41. // 处理每个章节
  42. for (const chapter of chapters) {
  43. // 添加章节标题
  44. formattedContent += `<h2>Chapter ${chapterIndex}</h2>\n`;
  45. // 处理章节内容,将段落用<p>标签包裹
  46. const paragraphs = chapter.trim().split(/\n\s*\n/);
  47. for (const paragraph of paragraphs) {
  48. if (paragraph.trim()) {
  49. formattedContent += `<p>${paragraph.trim()}</p>\n`;
  50. }
  51. }
  52. chapterIndex++;
  53. }
  54. // 写入输出文件
  55. await fs.promises.writeFile(outputPath, formattedContent);
  56. console.log(`格式化完成! 结果已保存到: ${outputPath}`);
  57. console.log(`共处理了 ${chapters.length} 个章节`);
  58. } catch (error) {
  59. console.error('处理文本文件时出错:', error);
  60. }
  61. }
  62. async function mergeChapterFiles(sourceDir, outputFile) {
  63. try {
  64. const SEPARATOR = '\n******************\n';
  65. const { readdir, readFile, writeFile } = fs.promises;
  66. // 获取所有HTML文件并按章节顺序排序
  67. const files = (await readdir(sourceDir))
  68. .filter(file => file.endsWith('.html'))
  69. .sort((a, b) => parseInt(a.split('_')[0]) - parseInt(b.split('_')[0]));
  70. if (files.length === 0) {
  71. console.error('未找到任何章节文件');
  72. return;
  73. }
  74. let mergedContent = '';
  75. // 处理每个章节文件
  76. for (const file of files) {
  77. const filePath = path.join(sourceDir, file);
  78. const html = await readFile(filePath, 'utf-8');
  79. // 提取章节标题
  80. const titleMatch = file.match(/_([^\.]+)\.html$/);
  81. const title = titleMatch ? titleMatch[1] : file;
  82. // 提取正文内容
  83. const contentMatch = html.match(/<div id="showReading"[^>]*>([\s\S]*?)<\/div>/);
  84. if (!contentMatch) continue;
  85. let content = contentMatch[1]
  86. .replace(/<sent[^>]*>/g, '')
  87. .replace(/<\/sent>/g, '')
  88. .replace(/<br>/g, '\n')
  89. .replace(/<[^>]+>/g, '')
  90. .replace(/\(adsbygoogle\s*=\s*window\.adsbygoogle\s*\|\|\s*\[\]\).push\(\{\}\);/g, '')
  91. .replace(/\n{3,}/g, '\n\n');
  92. // 添加到合并内容
  93. mergedContent += `${title}\n\n${content.trim()}${SEPARATOR}`;
  94. console.log(`已处理: ${file}`);
  95. }
  96. // 格式化合并后的内容
  97. const formattedContent = mergedContent;
  98. // 写入合并文件
  99. await writeFile(outputFile, formattedContent);
  100. console.log(`\n合并完成! 结果已保存到: ${outputFile}`);
  101. console.log(`共合并了 ${files.length} 个章节`);
  102. } catch (error) {
  103. console.error('合并章节时出错:', error);
  104. }
  105. }
  106. /**
  107. * 爬取单个章节内容
  108. * @param {string} baseUrl - 网站基础URL
  109. * @param {string} chapterUrl - 章节URL
  110. * @param {object} headers - 请求头
  111. * @param {boolean} debug - 是否开启调试模式
  112. * @returns {Promise<string|null>} - 章节内容或null
  113. */
  114. async function fetchChapterContent(baseUrl, chapterUrl, headers, debug = false) {
  115. try {
  116. // 如果URL不是以http开头,则添加baseUrl
  117. const fullUrl = chapterUrl.startsWith('http') ? chapterUrl : `${baseUrl}${chapterUrl}`;
  118. if (debug) {
  119. console.log(`爬取章节内容: ${fullUrl}`);
  120. }
  121. const response = await axios.get(fullUrl, { headers });
  122. if (response.status !== 200) {
  123. console.error(`获取章节内容失败,状态码: ${response.status}`);
  124. return null;
  125. }
  126. const $ = cheerio.load(response.data);
  127. // 尝试多种选择器来获取章节内容
  128. const contentSelectors = [
  129. '.chapter-content',
  130. '.article-content',
  131. '.content',
  132. '#content',
  133. '.text-content',
  134. '.chapter-text',
  135. '.novel-content'
  136. ];
  137. let content = null;
  138. for (const selector of contentSelectors) {
  139. const element = $(selector);
  140. if (element.length > 0) {
  141. content = element.html();
  142. if (debug) {
  143. console.log(`使用选择器 ${selector} 成功获取章节内容`);
  144. }
  145. break;
  146. }
  147. }
  148. // 如果上面的选择器都没找到内容,尝试查找包含大量文本的元素
  149. if (!content) {
  150. let maxTextLength = 0;
  151. let maxTextElement = null;
  152. $('div, article, section, p').each((_, element) => {
  153. const text = $(element).text().trim();
  154. if (text.length > maxTextLength && text.length > 500) {
  155. maxTextLength = text.length;
  156. maxTextElement = element;
  157. }
  158. });
  159. if (maxTextElement) {
  160. content = $(maxTextElement).html();
  161. if (debug) {
  162. console.log(`使用最长文本元素获取章节内容,长度: ${maxTextLength}`);
  163. }
  164. }
  165. }
  166. return content;
  167. } catch (error) {
  168. console.error(`爬取章节内容出错: ${error.message}`);
  169. return null;
  170. }
  171. }
  172. /**
  173. * 爬取小说章节目录
  174. * @param {string} title - 小说标题
  175. * @param {boolean} debug - 是否开启调试模式
  176. * @param {string} outputDir - 输出目录
  177. * @param {boolean} fetchContent - 是否爬取章节内容
  178. * @returns {Promise<Array|Object>} - 章节目录数组或错误对象
  179. */
  180. async function runScript(title, debug = false, outputDir = 'src/web_crawler', fetchContent = false){
  181. const url = `https://novelhi.com/s/index/`+title;
  182. console.log(`正在爬取网址: ${url}`);
  183. try {
  184. // 设置请求头,模拟浏览器行为
  185. const headers = {
  186. 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
  187. 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
  188. 'Accept-Language': 'en-US,en;q=0.5',
  189. 'Connection': 'keep-alive',
  190. 'Upgrade-Insecure-Requests': '1',
  191. 'Cache-Control': 'max-age=0'
  192. };
  193. const response = await axios.get(url, { headers });
  194. console.log(`请求状态码: ${response.status}`);
  195. // 如果开启调试模式,保存HTML内容
  196. if (debug) {
  197. const debugFilePath = path.join(outputDir, 'debug_page.html');
  198. fs.writeFileSync(debugFilePath, response.data);
  199. console.log(`已保存HTML内容到 ${debugFilePath} 文件`);
  200. }
  201. const $ = cheerio.load(response.data);
  202. // 提取章节目录
  203. const chapters = [];
  204. // 如果开启调试模式,打印页面结构信息
  205. if (debug) {
  206. console.log('页面标题:', $('title').text());
  207. console.log('页面主要容器数量:');
  208. console.log('- div 元素数量:', $('div').length);
  209. console.log('- ul 元素数量:', $('ul').length);
  210. console.log('- li 元素数量:', $('li').length);
  211. console.log('- a 元素数量:', $('a').length);
  212. }
  213. // 尝试多种选择器
  214. const selectors = [
  215. '.chapter-list li',
  216. '.book-catalog-list a',
  217. '.catalog-list li',
  218. '.chapter-item',
  219. '.chapter a',
  220. 'ul.chapters li',
  221. '.book-chapters a',
  222. '.novel-chapters a',
  223. 'span:contains("Chapter")'
  224. ];
  225. for (const selector of selectors) {
  226. console.log(`尝试选择器: ${selector}`);
  227. const elements = $(selector);
  228. console.log(`找到 ${elements.length} 个元素`);
  229. if (elements.length > 0) {
  230. elements.each((index, element) => {
  231. let chapterTitle, chapterUrl;
  232. if (selector === 'span:contains("Chapter")') {
  233. chapterTitle = $(element).text().trim();
  234. // 对于这个网站,我们可能需要构造章节URL
  235. chapterUrl = `/s/${title}/${chapterTitle.replace('Chapter ', '')}`;
  236. } else if (selector.includes('a')) {
  237. chapterTitle = $(element).text().trim();
  238. chapterUrl = $(element).attr('href');
  239. } else {
  240. chapterTitle = $(element).find('a').text().trim();
  241. chapterUrl = $(element).find('a').attr('href');
  242. }
  243. if (chapterTitle) {
  244. chapters.push({
  245. title: chapterTitle,
  246. url: chapterUrl || '#',
  247. index: index + 1
  248. });
  249. }
  250. });
  251. if (chapters.length > 0) {
  252. console.log(`使用选择器 ${selector} 成功找到章节`);
  253. break;
  254. }
  255. }
  256. }
  257. // 如果上面的选择器都没有找到章节,尝试更通用的方法
  258. if (chapters.length === 0) {
  259. console.log('尝试查找所有可能的章节链接...');
  260. // 查找所有包含"chapter"或"第"字样的链接或文本
  261. $('*').each((index, element) => {
  262. const text = $(element).text().trim();
  263. if (text && (text.includes('Chapter') || text.includes('第') || text.includes('章'))) {
  264. // 检查是否是单独的章节标题(不包含其他章节)
  265. if (text.match(/^Chapter \d+$/) || text.match(/^第[一二三四五六七八九十百千万]+章/) || text.match(/^\d+\.\s+.+$/)) {
  266. chapters.push({
  267. title: text,
  268. url: '#', // 如果没有URL,使用占位符
  269. index: index + 1
  270. });
  271. }
  272. }
  273. });
  274. }
  275. console.log(`共找到 ${chapters.length} 个章节`);
  276. // 将结果保存到文件中
  277. const outputFilePath = path.join(outputDir, `${title}_chapters.json`);
  278. fs.writeFileSync(outputFilePath, JSON.stringify(chapters, null, 2));
  279. console.log(`已将章节目录保存到 ${outputFilePath} 文件`);
  280. // 打印前10个章节和后10个章节
  281. if (debug || chapters.length <= 20) {
  282. console.log("章节目录:");
  283. console.log(JSON.stringify(chapters, null, 2));
  284. } else {
  285. console.log("前10个章节:");
  286. console.log(JSON.stringify(chapters.slice(0, 10), null, 2));
  287. console.log("...");
  288. console.log("后10个章节:");
  289. console.log(JSON.stringify(chapters.slice(-10), null, 2));
  290. }
  291. // 如果需要爬取章节内容
  292. if (fetchContent && chapters.length > 0) {
  293. console.log(`开始爬取章节内容...`);
  294. // 创建章节内容目录
  295. const contentDir = path.join(outputDir, `${title}_contents`);
  296. if (!fs.existsSync(contentDir)) {
  297. fs.mkdirSync(contentDir, { recursive: true });
  298. }
  299. // 创建一个包含所有章节内容的对象
  300. const contentsObj = {};
  301. // 设置基础URL
  302. const baseUrl = 'https://novelhi.com';
  303. // 爬取所有章节
  304. const chaptersToFetch = chapters;
  305. console.log(`将爬取 ${chaptersToFetch.length}/${chapters.length} 个章节的内容`);
  306. // 爬取章节内容
  307. for (let i = 0; i < chaptersToFetch.length; i++) {
  308. const chapter = chaptersToFetch[i];
  309. // 每爬取10个章节,暂停一下,避免请求过于频繁
  310. if (i > 0 && i % 10 === 0) {
  311. console.log(`已爬取 ${i}/${chaptersToFetch.length} 章节,暂停 2 秒...`);
  312. await setTimeout(2000);
  313. }
  314. // 爬取章节内容
  315. const content = await fetchChapterContent(baseUrl, chapter.url, headers, debug);
  316. if (content) {
  317. // 更新章节对象,添加内容
  318. chapters[i].content = content;
  319. contentsObj[chapter.title] = content;
  320. // 将章节内容保存到单独的文件
  321. const chapterFileName = `${String(chapter.index).padStart(4, '0')}_${chapter.title.replace(/[\\/:*?"<>|]/g, '_')}.html`;
  322. const chapterFilePath = path.join(contentDir, chapterFileName);
  323. // 创建一个完整的HTML文件
  324. const htmlContent = `<!DOCTYPE html>
  325. <html>
  326. <head>
  327. <meta charset="UTF-8">
  328. <title>${chapter.title}</title>
  329. <style>
  330. body {
  331. font-family: Arial, sans-serif;
  332. line-height: 1.6;
  333. margin: 0 auto;
  334. max-width: 800px;
  335. padding: 20px;
  336. }
  337. h1 {
  338. text-align: center;
  339. margin-bottom: 30px;
  340. }
  341. p {
  342. text-indent: 2em;
  343. margin-bottom: 1em;
  344. }
  345. </style>
  346. </head>
  347. <body>
  348. <h1>${chapter.title}</h1>
  349. ${content}
  350. </body>
  351. </html>`;
  352. fs.writeFileSync(chapterFilePath, htmlContent);
  353. if (debug) {
  354. console.log(`已保存章节 ${chapter.title} 到 ${chapterFilePath}`);
  355. } else if (i % 10 === 0 || i === chapters.length - 1) {
  356. console.log(`已保存 ${i + 1}/${chapters.length} 章节`);
  357. }
  358. } else {
  359. console.error(`获取章节 ${chapter.title} 内容失败`);
  360. }
  361. }
  362. // 保存所有章节内容到一个文件
  363. const allContentsPath = path.join(outputDir, `${title}_all_contents.json`);
  364. fs.writeFileSync(allContentsPath, JSON.stringify(contentsObj, null, 2));
  365. console.log(`已将所有章节内容保存到 ${allContentsPath} 文件`);
  366. // 更新章节目录文件,包含内容
  367. const chaptersWithContentPath = path.join(outputDir, `${title}_chapters_with_content.json`);
  368. fs.writeFileSync(chaptersWithContentPath, JSON.stringify(chapters, null, 2));
  369. console.log(`已将包含内容的章节目录保存到 ${chaptersWithContentPath} 文件`);
  370. }
  371. return chapters;
  372. } catch (err) {
  373. console.error("爬取过程中出错:", err.message);
  374. return { errcode: 101, errStr: err.message };
  375. } finally {
  376. console.log("完成");
  377. }
  378. }
  379. /**
  380. * 生成 EPUB 电子书
  381. * @param {string} contentFilePath - HTML 格式的正文文件路径
  382. * @param {string} coverImagePath - 封面图片路径
  383. * @param {string} outputPath - 输出 EPUB 文件路径
  384. * @param {string} bookTitle - 电子书标题
  385. * @param {string} author - 作者名称
  386. * @returns {Promise<void>}
  387. */
  388. async function generateEpub(contentFilePath, coverImagePath, outputPath, bookTitle, author) {
  389. try {
  390. // 读取 HTML 内容
  391. const htmlContent = fs.readFileSync(contentFilePath, 'utf-8');
  392. const dom = new JSDOM(htmlContent);
  393. const document = dom.window.document;
  394. // 提取章节 (h2 标签)
  395. const chapterElements = document.querySelectorAll('h2');
  396. const chapters = [];
  397. // 处理每个章节
  398. chapterElements.forEach((chapterElement, index) => {
  399. const title = chapterElement.textContent.trim();
  400. let content = '';
  401. // 收集当前章节的所有段落,直到下一个 h2 或文档结束
  402. let currentElement = chapterElement.nextElementSibling;
  403. while (currentElement && currentElement.tagName.toLowerCase() !== 'h2') {
  404. if (currentElement.tagName.toLowerCase() === 'p') {
  405. // 每个 p 标签作为独立段落,用 <p> 标签包裹并添加样式类
  406. content += `<p>${currentElement.innerHTML}</p>\n`;
  407. }
  408. currentElement = currentElement.nextElementSibling;
  409. }
  410. chapters.push({ title, content });
  411. });
  412. // 读取封面图片
  413. const coverImage = fs.readFileSync(coverImagePath);
  414. // 创建 EPUB 容器
  415. const zip = new JSZip();
  416. // 添加 mimetype 文件(必须是第一个文件,且不压缩)
  417. zip.file('mimetype', 'application/epub+zip', { compression: 'STORE' });
  418. // 创建 META-INF 目录
  419. const metaInf = zip.folder('META-INF');
  420. metaInf.file('container.xml', `<?xml version="1.0"?>
  421. <container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
  422. <rootfiles>
  423. <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
  424. </rootfiles>
  425. </container>`);
  426. // 创建 OEBPS 目录
  427. const oebps = zip.folder('OEBPS');
  428. // 添加封面图片
  429. oebps.file('Images/cover.jpg', coverImage);
  430. // 生成封面页 XHTML
  431. const coverXhtml = `<?xml version="1.0" encoding="UTF-8"?>
  432. <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
  433. <html xmlns="http://www.w3.org/1999/xhtml">
  434. <head>
  435. <title>封面</title>
  436. <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
  437. <meta name="calibre:cover" content="true"/>
  438. <link rel="stylesheet" type="text/css" href="../Styles/stylesheet.css"/>
  439. <style type="text/css">
  440. body {
  441. margin: 0;
  442. padding: 0;
  443. text-align: center;
  444. }
  445. img {
  446. max-width: 100%;
  447. height: auto;
  448. margin: 0;
  449. padding: 0;
  450. }
  451. </style>
  452. </head>
  453. <body>
  454. <div class="cover-container">
  455. <img src="../Images/cover.jpg" alt="封面"/>
  456. </div>
  457. </body>
  458. </html>`;
  459. const textFolder = oebps.folder('Text');
  460. textFolder.file('cover.xhtml', coverXhtml);
  461. // 生成章节 HTML 文件
  462. const spineItems = [
  463. { idref: 'cover', linear: 'no' }
  464. ];
  465. const manifestItems = [
  466. { id: 'cover', href: 'Text/cover.xhtml', mediaType: 'application/xhtml+xml', properties: 'cover-image' },
  467. { id: 'cover-image', href: 'Images/cover.jpg', mediaType: 'image/jpeg' },
  468. { id: 'ncx', href: 'toc.ncx', mediaType: 'application/x-dtbncx+xml' }
  469. ];
  470. chapters.forEach((chapter, index) => {
  471. const { title, content } = chapter;
  472. const chapterId = `chapter_${index}`;
  473. const chapterFileName = `${chapterId}.xhtml`;
  474. const chapterHtml = `<?xml version="1.0" encoding="UTF-8"?>
  475. <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
  476. <html xmlns="http://www.w3.org/1999/xhtml">
  477. <head>
  478. <title>${title}</title>
  479. <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
  480. <link rel="stylesheet" type="text/css" href="../Styles/stylesheet.css"/>
  481. </head>
  482. <body>
  483. <h2 class="chapter-title">${title}</h2>
  484. <div class="chapter-content">
  485. ${content}
  486. </div>
  487. </body>
  488. </html>`;
  489. textFolder.file(chapterFileName, chapterHtml);
  490. manifestItems.push({ id: chapterId, href: `Text/${chapterFileName}`, mediaType: 'application/xhtml+xml' });
  491. spineItems.push({ idref: chapterId });
  492. });
  493. // 生成目录文件
  494. const tocHtml = `<?xml version="1.0" encoding="UTF-8"?>
  495. <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
  496. <html xmlns="http://www.w3.org/1999/xhtml">
  497. <head>
  498. <title>目录</title>
  499. <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
  500. <link rel="stylesheet" type="text/css" href="../Styles/stylesheet.css"/>
  501. </head>
  502. <body>
  503. <h2 class="toc-title">目录</h2>
  504. <ol class="toc-list">
  505. ${chapters.map((chapter, index) => {
  506. return `<li class="toc-item"><a href="chapter_${index}.xhtml">${chapter.title}</a></li>`;
  507. }).join('\n')}
  508. </ol>
  509. </body>
  510. </html>`;
  511. textFolder.file('toc.xhtml', tocHtml);
  512. manifestItems.push({ id: 'toc', href: 'Text/toc.xhtml', mediaType: 'application/xhtml+xml' });
  513. // 添加 CSS 文件
  514. const cssFolder = oebps.folder('Styles');
  515. const csspath=path.join(__dirname, 'epub_styles.css');
  516. console.log("🚀 ~ generateEpub ~ csspath:", csspath)
  517. const cssContent = fs.readFileSync(csspath, 'utf-8');
  518. cssFolder.file('stylesheet.css', cssContent);
  519. manifestItems.push({ id: 'stylesheet', href: 'Styles/stylesheet.css', mediaType: 'text/css' });
  520. // 生成唯一标识符
  521. const bookUUID = uuidv4();
  522. // 生成 toc.ncx 文件
  523. const tocNcx = `<?xml version='1.0' encoding='utf-8'?>
  524. <ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="zh-CN">
  525. <head>
  526. <meta content="${bookUUID}" name="dtb:uid"/>
  527. <meta content="2" name="dtb:depth"/>
  528. <meta content="0" name="dtb:totalPageCount"/>
  529. <meta content="0" name="dtb:maxPageNumber"/>
  530. </head>
  531. <docTitle>
  532. <text>${bookTitle}</text>
  533. </docTitle>
  534. <navMap>
  535. <navPoint id="navpoint-0" playOrder="0">
  536. <navLabel>
  537. <text>封面</text>
  538. </navLabel>
  539. <content src="Text/cover.xhtml"/>
  540. </navPoint>
  541. <navPoint id="navpoint-1" playOrder="1">
  542. <navLabel>
  543. <text>目录</text>
  544. </navLabel>
  545. <content src="Text/toc.xhtml"/>
  546. </navPoint>
  547. ${chapters.map((chapter, index) => {
  548. return `<navPoint id="navpoint-${index + 2}" playOrder="${index + 2}">
  549. <navLabel>
  550. <text>${chapter.title}</text>
  551. </navLabel>
  552. <content src="Text/chapter_${index}.xhtml"/>
  553. </navPoint>`;
  554. }).join('\n')}
  555. </navMap>
  556. </ncx>`;
  557. oebps.file('toc.ncx', tocNcx);
  558. // 生成 content.opf 文件
  559. const contentOpf = `<?xml version="1.0" encoding="UTF-8"?>
  560. <package xmlns="http://www.idpf.org/2007/opf" version="2.0" unique-identifier="uuid_id">
  561. <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
  562. <dc:identifier id="uuid_id">urn:uuid:${bookUUID}</dc:identifier>
  563. <dc:title>${bookTitle}</dc:title>
  564. <dc:creator>${author}</dc:creator>
  565. <dc:language>zh-CN</dc:language>
  566. <dc:date>${new Date().toISOString().split('T')[0]}</dc:date>
  567. <meta name="cover" content="cover-image"/>
  568. </metadata>
  569. <manifest>
  570. ${manifestItems.map(item => `<item id="${item.id}" href="${item.href}" media-type="${item.mediaType}"/>`).join('\n')}
  571. </manifest>
  572. <spine toc="ncx">
  573. <itemref idref="cover"/>
  574. <itemref idref="toc"/>
  575. ${spineItems.map(item => `<itemref idref="${item.idref}"/>`).join('\n')}
  576. </spine>
  577. <guide>
  578. <reference type="cover" title="封面" href="Text/cover.xhtml"/>
  579. </guide>
  580. </package>`;
  581. oebps.file('content.opf', contentOpf);
  582. // 生成 EPUB 文件
  583. const epubContent = await zip.generateAsync({
  584. type: 'nodebuffer',
  585. compression: 'DEFLATE',
  586. mimeType: 'application/epub+zip'
  587. });
  588. fs.writeFileSync(outputPath, epubContent);
  589. console.log(`EPUB 电子书已生成: ${outputPath}`);
  590. } catch (error) {
  591. console.error('生成 EPUB 电子书时出错:', error);
  592. }
  593. }
  594. /**
  595. * 命令行入口
  596. */
  597. //const title = "Zhui-Xu";
  598. const title="Throne-of-Magical-Arcana";
  599. //const title = 'Release-that-Witch';
  600. //const title = 'Strange-Life-of-a-Cat';
  601. //const title = "Hidden-Assassin";
  602. //const author = "Angry Banana";
  603. const author = "Cuttlefish That Loves Diving";
  604. const coverName = "cover.jpg";
  605. // 从命令行参数获取小说标题
  606. const args = process.argv.slice(2);
  607. // 显示帮助信息
  608. if (args.includes("--help") || args.includes("-h")) {
  609. showHelp();
  610. process.exit(0);
  611. }
  612. // 获取参数
  613. const debug = args.includes("--debug");
  614. const outputDir = args.find(arg => arg.startsWith("--output="))?.split("=")[1] || title;
  615. const fetchContent = 1;
  616. console.log(`开始爬取小说: ${title}`);
  617. console.log(`调试模式: ${debug ? "开启" : "关闭"}`);
  618. console.log(`输出目录: ${outputDir}`);
  619. console.log(`爬取章节内容: ${fetchContent ? "是" : "否"}`);
  620. // 确保输出目录存在
  621. if (!fs.existsSync(outputDir)) {
  622. fs.mkdirSync(outputDir, { recursive: true });
  623. }
  624. if (1==1){
  625. await runScript(title, debug, outputDir, fetchContent)
  626. .then(result => {
  627. if (result && result.errcode) {
  628. console.error(`爬取失败: ${result.errStr}`);
  629. process.exit(1);
  630. } else {
  631. console.log("爬取任务完成!");
  632. }
  633. })
  634. .catch(err => {
  635. console.error("程序执行出错:", err);
  636. process.exit(1);
  637. });
  638. }
  639. let sourceDir = path.join(__dirname, title+'/'+title+'_contents');
  640. let outputFile = path.join(__dirname, title+'/'+title+'.txt');
  641. let outputFile2 = path.join(__dirname, title+'/'+title+'.html');
  642. let coverFile=path.join(__dirname, title+'/'+coverName);
  643. let epubFile=path.join(__dirname, title+'/'+title+'.epub');
  644. if (1==1){
  645. await mergeChapterFiles(sourceDir, outputFile)
  646. .then(() => console.log('合并操作完成'))
  647. .catch(err => console.error('合并操作失败:', err));
  648. await formatTextFile(outputFile, outputFile2);
  649. }
  650. await generateEpub(outputFile2, coverFile, epubFile, title, author);
  651. // 导出函数,以便其他模块使用
  652. export default runScript;