import fs from 'fs';
import axios from 'axios';
import * as cheerio from 'cheerio';
import path from 'path';
import { setTimeout } from 'timers/promises';
import { fileURLToPath } from 'url';
import JSZip from 'jszip';
import { v4 as uuidv4 } from 'uuid';
import { JSDOM } from 'jsdom';
// ES modules do not provide __filename/__dirname, so derive them from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
/**
 * Convert plain chapter text into a simple HTML fragment.
 *
 * The text is split into blocks on blank lines. When the first line of a
 * block starts with "Chapter <number>", that line becomes an <h2> heading and
 * any remaining lines of the block become a <p>. Every other non-empty block
 * becomes a <p>. Headings are emitted as siblings of paragraphs, never nested
 * inside them.
 *
 * @param {string} content - Raw plain-text content.
 * @returns {Promise<string>} The formatted HTML fragment ('' for empty input).
 */
async function formatChapterContent(content) {
  const blocks = String(content ?? '')
    .split(/\n\s*\n/)
    .map((block) => block.trim())
    .filter(Boolean);

  const html = [];
  for (const block of blocks) {
    const [firstLine, ...restLines] = block.split('\n');
    if (/^Chapter \d+/.test(firstLine)) {
      html.push(`<h2>${firstLine.trim()}</h2>`);
      const body = restLines.join('\n').trim();
      if (body) {
        html.push(`<p>${body}</p>`);
      }
    } else {
      html.push(`<p>${block}</p>`);
    }
  }
  return html.join('\n');
}
/**
 * Convert a merged plain-text novel into a minimal HTML fragment file.
 *
 * The input is split on "Chapter <number>" markers. Each resulting chunk is
 * written as an <h2> heading (chapters are renumbered sequentially from 1)
 * followed by one <p> per blank-line-separated paragraph.
 *
 * Errors are logged and swallowed (best-effort), so the returned promise
 * always resolves.
 *
 * @param {string} filePath - Path of the plain-text input file.
 * @param {string} outputPath - Path of the HTML file to write.
 * @returns {Promise<void>}
 */
async function formatTextFile(filePath, outputPath) {
  try {
    const content = await fs.promises.readFile(filePath, 'utf-8');
    console.log(`已读取文件: ${filePath}`);

    // Drop empty and whitespace-only chunks (e.g. the text before the first
    // marker) so they are not counted as chapters.
    const chapters = content.split(/Chapter \d+/).filter((chunk) => chunk.trim());
    console.log(`检测到 ${chapters.length} 个章节内容块`);

    const lines = [];
    chapters.forEach((chapter, position) => {
      lines.push(`<h2>Chapter ${position + 1}</h2>`);
      for (const paragraph of chapter.trim().split(/\n\s*\n/)) {
        const text = paragraph.trim();
        if (text) {
          lines.push(`<p>${text}</p>`);
        }
      }
    });
    const formattedContent = lines.length > 0 ? `${lines.join('\n')}\n` : '';

    await fs.promises.writeFile(outputPath, formattedContent);
    console.log(`格式化完成! 结果已保存到: ${outputPath}`);
    console.log(`共处理了 ${chapters.length} 个章节`);
  } catch (error) {
    console.error('处理文本文件时出错:', error);
  }
}
/**
 * Merge per-chapter HTML files into a single plain-text file.
 *
 * Files in `sourceDir` ending in ".html" are ordered by the numeric prefix
 * before the first "_" in their name. For each file the chapter title is taken
 * from the file name (the part after the first "_"), the content of the
 * wrapping <div> is extracted, markup is stripped, and the result is appended
 * followed by a separator line.
 *
 * Errors are logged and swallowed (best-effort), so the returned promise
 * always resolves.
 *
 * @param {string} sourceDir - Directory containing the chapter HTML files.
 * @param {string} outputFile - Path of the merged text file to write.
 * @returns {Promise<void>}
 */
async function mergeChapterFiles(sourceDir, outputFile) {
  try {
    const SEPARATOR = '\n******************\n';
    const { readdir, readFile, writeFile } = fs.promises;

    // Numeric chapter prefix; names without one sort last instead of
    // producing NaN comparisons.
    const orderOf = (name) => {
      const value = Number.parseInt(name.split('_')[0], 10);
      return Number.isNaN(value) ? Number.MAX_SAFE_INTEGER : value;
    };

    const files = (await readdir(sourceDir))
      .filter((file) => file.endsWith('.html'))
      .sort((a, b) => orderOf(a) - orderOf(b));

    if (files.length === 0) {
      console.error('未找到任何章节文件');
      return;
    }

    let mergedContent = '';
    for (const file of files) {
      const html = await readFile(path.join(sourceDir, file), 'utf-8');

      // Title is everything after the first "_" (titles may contain dots).
      const titleMatch = file.match(/_(.+)\.html$/);
      const title = titleMatch ? titleMatch[1] : file;

      // Greedy match from the first <div> to the LAST </div>, so nested divs
      // inside the chapter body do not truncate it; tags are stripped below.
      // NOTE(review): assumes one wrapping <div> per chapter file — confirm
      // against the files written by the crawler.
      const contentMatch = html.match(/<div[^>]*>([\s\S]*)<\/div>/);
      if (!contentMatch) continue;

      const content = contentMatch[1]
        .replace(/<sent[^>]*>/g, '')
        .replace(/<\/sent>/g, '')
        .replace(/<br\s*\/?>/gi, '\n')
        .replace(/<[^>]+>/g, '')
        // Inline ad bootstrap left behind once its <script> tag is stripped.
        .replace(/\(adsbygoogle\s*=\s*window\.adsbygoogle\s*\|\|\s*\[\]\)\.push\(\{\}\);/g, '')
        .replace(/\n{3,}/g, '\n\n');

      mergedContent += `${title}\n\n${content.trim()}${SEPARATOR}`;
      console.log(`已处理: ${file}`);
    }

    await writeFile(outputFile, mergedContent);
    console.log(`\n合并完成! 结果已保存到: ${outputFile}`);
    console.log(`共合并了 ${files.length} 个章节`);
  } catch (error) {
    console.error('合并章节时出错:', error);
  }
}
/**
 * Fetch one chapter page and return the inner HTML of its content element.
 *
 * The chapter URL is resolved against `baseUrl` (absolute URLs are used
 * as-is). Placeholder links ("#", empty, "javascript:") are skipped without
 * issuing a request. The content is located with a list of known selectors;
 * if none matches, the first div/article/section/p whose trimmed text is
 * longer than 500 characters and strictly longer than any earlier candidate
 * is used.
 *
 * @param {string} baseUrl - Site base URL used to resolve relative links.
 * @param {string} chapterUrl - Chapter URL (absolute or relative).
 * @param {object} headers - HTTP request headers.
 * @param {boolean} debug - Whether to print debug output.
 * @returns {Promise<string|null>} Chapter HTML, or null when nothing could be fetched.
 */
async function fetchChapterContent(baseUrl, chapterUrl, headers, debug = false) {
  // The chapter list uses '#' when no link was found; requesting it would
  // just download the site's landing page and store it as chapter text.
  if (typeof chapterUrl !== 'string' || /^\s*(#|javascript:|$)/i.test(chapterUrl)) {
    console.error(`章节链接无效,已跳过: ${chapterUrl}`);
    return null;
  }

  try {
    // new URL() handles absolute, root-relative and path-relative links.
    const fullUrl = new URL(chapterUrl, baseUrl).href;
    if (debug) {
      console.log(`爬取章节内容: ${fullUrl}`);
    }

    const response = await axios.get(fullUrl, { headers });
    if (response.status !== 200) {
      console.error(`获取章节内容失败,状态码: ${response.status}`);
      return null;
    }

    const $ = cheerio.load(response.data);

    const contentSelectors = [
      '.chapter-content',
      '.article-content',
      '.content',
      '#content',
      '.text-content',
      '.chapter-text',
      '.novel-content'
    ];

    let content = null;
    for (const selector of contentSelectors) {
      const element = $(selector);
      if (element.length > 0) {
        content = element.html();
        if (debug) {
          console.log(`使用选择器 ${selector} 成功获取章节内容`);
        }
        break;
      }
    }

    // Fallback: take the element holding the most text.
    // NOTE(review): ancestors are visited before descendants and contain at
    // least as much text, so this tends to select an outer wrapper — confirm
    // that is acceptable for the target site.
    if (!content) {
      let maxTextLength = 0;
      let maxTextElement = null;
      $('div, article, section, p').each((_, element) => {
        const text = $(element).text().trim();
        if (text.length > maxTextLength && text.length > 500) {
          maxTextLength = text.length;
          maxTextElement = element;
        }
      });
      if (maxTextElement) {
        content = $(maxTextElement).html();
        if (debug) {
          console.log(`使用最长文本元素获取章节内容,长度: ${maxTextLength}`);
        }
      }
    }

    return content;
  } catch (error) {
    console.error(`爬取章节内容出错: ${error.message}`);
    return null;
  }
}
/**
 * Crawl the chapter index of a novel on novelhi.com and optionally download
 * every chapter.
 *
 * The chapter list is written to `<outputDir>/<title>_chapters.json`. When
 * `fetchContent` is truthy, each chapter is also saved as an HTML file under
 * `<outputDir>/<title>_contents/`, and two JSON files are written: one with
 * all contents keyed by chapter title and one with the chapter list including
 * contents.
 *
 * @param {string} title - Novel slug used in the site URL.
 * @param {boolean} debug - Whether to print/save debug information.
 * @param {string} outputDir - Directory for all generated files (created if missing).
 * @param {boolean} fetchContent - Whether to download chapter contents.
 * @returns {Promise<Array<object>|{errcode: number, errStr: string}>}
 *   The chapter list, or an error object when the crawl failed.
 */
async function runScript(title, debug = false, outputDir = 'src/web_crawler', fetchContent = false){
  const url = `https://novelhi.com/s/index/${title}`;
  console.log(`正在爬取网址: ${url}`);
  try {
    // All results are written below outputDir, so make sure it exists first.
    fs.mkdirSync(outputDir, { recursive: true });

    // Browser-like request headers.
    const headers = {
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
      'Accept-Language': 'en-US,en;q=0.5',
      'Connection': 'keep-alive',
      'Upgrade-Insecure-Requests': '1',
      'Cache-Control': 'max-age=0'
    };
    const response = await axios.get(url, { headers });
    console.log(`请求状态码: ${response.status}`);

    if (debug) {
      const debugFilePath = path.join(outputDir, 'debug_page.html');
      fs.writeFileSync(debugFilePath, response.data);
      console.log(`已保存HTML内容到 ${debugFilePath} 文件`);
    }

    const $ = cheerio.load(response.data);
    const chapters = [];

    if (debug) {
      console.log('页面标题:', $('title').text());
      console.log('页面主要容器数量:');
      console.log('- div 元素数量:', $('div').length);
      console.log('- ul 元素数量:', $('ul').length);
      console.log('- li 元素数量:', $('li').length);
      console.log('- a 元素数量:', $('a').length);
    }

    // Candidate selectors, tried in order until one yields chapters.
    const SPAN_SELECTOR = 'span:contains("Chapter")';
    const selectors = [
      '.chapter-list li',
      '.book-catalog-list a',
      '.catalog-list li',
      '.chapter-item',
      '.chapter a',
      'ul.chapters li',
      '.book-chapters a',
      '.novel-chapters a',
      SPAN_SELECTOR
    ];

    for (const selector of selectors) {
      console.log(`尝试选择器: ${selector}`);
      const elements = $(selector);
      console.log(`找到 ${elements.length} 个元素`);
      if (elements.length === 0) {
        continue;
      }

      elements.each((index, element) => {
        const node = $(element);
        let chapterTitle;
        let chapterUrl;
        if (selector === SPAN_SELECTOR) {
          chapterTitle = node.text().trim();
          // Plain spans carry no link, so build the URL from the chapter number.
          chapterUrl = `/s/${title}/${chapterTitle.replace('Chapter ', '')}`;
        } else {
          // Decide by the matched element, not by the selector text: use the
          // element itself when it is an anchor, otherwise the anchor inside
          // it (falling back to the element's own text for the title).
          const link = node.is('a') ? node : node.find('a');
          chapterTitle = (link.length > 0 ? link : node).text().trim();
          chapterUrl = link.attr('href');
        }
        if (chapterTitle) {
          chapters.push({
            title: chapterTitle,
            url: chapterUrl || '#',
            index: index + 1
          });
        }
      });

      if (chapters.length > 0) {
        console.log(`使用选择器 ${selector} 成功找到章节`);
        break;
      }
    }

    // Last resort: scan every element for text that looks like a chapter title.
    if (chapters.length === 0) {
      console.log('尝试查找所有可能的章节链接...');
      $('*').each((index, element) => {
        const text = $(element).text().trim();
        if (text && (text.includes('Chapter') || text.includes('第') || text.includes('章'))) {
          // Keep only stand-alone chapter titles, not containers of several.
          if (text.match(/^Chapter \d+$/) || text.match(/^第[一二三四五六七八九十百千万]+章/) || text.match(/^\d+\.\s+.+$/)) {
            chapters.push({
              title: text,
              url: '#', // placeholder: no link is known for this entry
              index: index + 1
            });
          }
        }
      });
    }

    console.log(`共找到 ${chapters.length} 个章节`);

    const outputFilePath = path.join(outputDir, `${title}_chapters.json`);
    fs.writeFileSync(outputFilePath, JSON.stringify(chapters, null, 2));
    console.log(`已将章节目录保存到 ${outputFilePath} 文件`);

    // Print everything for short lists / debug, otherwise head and tail only.
    if (debug || chapters.length <= 20) {
      console.log("章节目录:");
      console.log(JSON.stringify(chapters, null, 2));
    } else {
      console.log("前10个章节:");
      console.log(JSON.stringify(chapters.slice(0, 10), null, 2));
      console.log("...");
      console.log("后10个章节:");
      console.log(JSON.stringify(chapters.slice(-10), null, 2));
    }

    if (fetchContent && chapters.length > 0) {
      console.log(`开始爬取章节内容...`);

      const contentDir = path.join(outputDir, `${title}_contents`);
      fs.mkdirSync(contentDir, { recursive: true });

      const contentsObj = {};
      const baseUrl = 'https://novelhi.com';
      const chaptersToFetch = chapters;
      console.log(`将爬取 ${chaptersToFetch.length}/${chapters.length} 个章节的内容`);

      for (let i = 0; i < chaptersToFetch.length; i++) {
        const chapter = chaptersToFetch[i];

        // Pause after every 10 chapters to avoid hammering the site.
        if (i > 0 && i % 10 === 0) {
          console.log(`已爬取 ${i}/${chaptersToFetch.length} 章节,暂停 2 秒...`);
          await setTimeout(2000);
        }

        const content = await fetchChapterContent(baseUrl, chapter.url, headers, debug);
        if (!content) {
          console.error(`获取章节 ${chapter.title} 内容失败`);
          continue;
        }

        chapters[i].content = content;
        contentsObj[chapter.title] = content;

        // File name: zero-padded index + title with path-unsafe characters replaced.
        const chapterFileName = `${String(chapter.index).padStart(4, '0')}_${chapter.title.replace(/[\\/:*?"<>|]/g, '_')}.html`;
        const chapterFilePath = path.join(contentDir, chapterFileName);

        // Stand-alone HTML document; the chapter body lives in one wrapping <div>.
        const htmlContent = `<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>${chapter.title}</title>
</head>
<body>
<h1>${chapter.title}</h1>
<div class="chapter-content">
${content}
</div>
</body>
</html>
`;
        fs.writeFileSync(chapterFilePath, htmlContent);

        if (debug) {
          console.log(`已保存章节 ${chapter.title} 到 ${chapterFilePath}`);
        } else if (i % 10 === 0 || i === chapters.length - 1) {
          console.log(`已保存 ${i + 1}/${chapters.length} 章节`);
        }
      }

      const allContentsPath = path.join(outputDir, `${title}_all_contents.json`);
      fs.writeFileSync(allContentsPath, JSON.stringify(contentsObj, null, 2));
      console.log(`已将所有章节内容保存到 ${allContentsPath} 文件`);

      const chaptersWithContentPath = path.join(outputDir, `${title}_chapters_with_content.json`);
      fs.writeFileSync(chaptersWithContentPath, JSON.stringify(chapters, null, 2));
      console.log(`已将包含内容的章节目录保存到 ${chaptersWithContentPath} 文件`);
    }

    return chapters;
  } catch (err) {
    console.error("爬取过程中出错:", err.message);
    return { errcode: 101, errStr: err.message };
  } finally {
    console.log("完成");
  }
}
/**
 * Build an EPUB 3 e-book (with an NCX table of contents for older readers)
 * from a formatted HTML file.
 *
 * Every <h2> in the input starts a chapter; the <p> siblings that follow it,
 * up to the next <h2>, form the chapter body. The archive contains a cover
 * page, a navigation document, one XHTML file per chapter, a stylesheet read
 * from `epub_styles.css` next to this script (empty when that file is
 * missing), `toc.ncx` and `content.opf`.
 *
 * Errors are logged and swallowed (best-effort), so the returned promise
 * always resolves.
 *
 * @param {string} contentFilePath - Path of the HTML file holding <h2>/<p> content.
 * @param {string} coverImagePath - Path of the cover image (stored as Images/cover.jpg).
 * @param {string} outputPath - Path of the EPUB file to write.
 * @param {string} bookTitle - Book title.
 * @param {string} author - Author name.
 * @returns {Promise<void>}
 */
async function generateEpub(contentFilePath, coverImagePath, outputPath, bookTitle, author) {
  const XML_DECLARATION = '<?xml version="1.0" encoding="UTF-8"?>';

  // Escape text for use in XML element content and attribute values.
  const escapeXml = (text) =>
    String(text ?? '')
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;');

  // HTML serialisation is not always well-formed XML: &nbsp; is not a
  // predefined XML entity and <br> is not self-closed.
  const toXhtml = (html) =>
    html.replace(/&nbsp;/g, '&#160;').replace(/<br\s*\/?>/gi, '<br/>');

  // Wrap body markup in a complete XHTML document.
  const xhtmlPage = (pageTitle, body, htmlAttributes = '') =>
    [
      XML_DECLARATION,
      '<!DOCTYPE html>',
      `<html xmlns="http://www.w3.org/1999/xhtml"${htmlAttributes} xml:lang="zh-CN" lang="zh-CN">`,
      '<head>',
      '  <meta charset="UTF-8"/>',
      `  <title>${escapeXml(pageTitle)}</title>`,
      '  <link rel="stylesheet" type="text/css" href="../Styles/stylesheet.css"/>',
      '</head>',
      '<body>',
      body,
      '</body>',
      '</html>',
      ''
    ].join('\n');

  try {
    // Parse the input and collect chapters (<h2> + following <p> siblings).
    const htmlContent = fs.readFileSync(contentFilePath, 'utf-8');
    const dom = new JSDOM(htmlContent);
    const document = dom.window.document;

    const chapters = [];
    document.querySelectorAll('h2').forEach((chapterElement) => {
      const title = chapterElement.textContent.trim();
      let content = '';
      let currentElement = chapterElement.nextElementSibling;
      while (currentElement && currentElement.tagName.toLowerCase() !== 'h2') {
        if (currentElement.tagName.toLowerCase() === 'p') {
          // NOTE(review): the class name should match a rule in
          // epub_styles.css — confirm against that stylesheet.
          content += `<p class="paragraph">${toXhtml(currentElement.innerHTML)}</p>\n`;
        }
        currentElement = currentElement.nextElementSibling;
      }
      chapters.push({ title, content });
    });

    if (chapters.length === 0) {
      // An EPUB navigation list must not be empty.
      throw new Error(`未在 ${contentFilePath} 中找到任何章节 (h2)`);
    }

    const coverImage = fs.readFileSync(coverImagePath);

    const zip = new JSZip();
    // The mimetype entry must be the first file in the archive and stored uncompressed.
    zip.file('mimetype', 'application/epub+zip', { compression: 'STORE' });

    zip.folder('META-INF').file(
      'container.xml',
      [
        XML_DECLARATION,
        '<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">',
        '  <rootfiles>',
        '    <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>',
        '  </rootfiles>',
        '</container>',
        ''
      ].join('\n')
    );

    const oebps = zip.folder('OEBPS');
    oebps.file('Images/cover.jpg', coverImage);

    const textFolder = oebps.folder('Text');
    textFolder.file(
      'cover.xhtml',
      xhtmlPage('封面', '  <div class="cover"><img src="../Images/cover.jpg" alt="封面"/></div>')
    );

    // The cover page is outside the linear reading order; the navigation
    // page is in the spine because the NCX links to it.
    const spineItems = [
      { idref: 'cover', linear: 'no' },
      { idref: 'toc' }
    ];
    // In EPUB 3 the "cover-image" property belongs to the image item and
    // "nav" marks the navigation document.
    const manifestItems = [
      { id: 'cover', href: 'Text/cover.xhtml', mediaType: 'application/xhtml+xml' },
      { id: 'cover-image', href: 'Images/cover.jpg', mediaType: 'image/jpeg', properties: 'cover-image' },
      { id: 'ncx', href: 'toc.ncx', mediaType: 'application/x-dtbncx+xml' },
      { id: 'toc', href: 'Text/toc.xhtml', mediaType: 'application/xhtml+xml', properties: 'nav' }
    ];

    // One XHTML file per chapter.
    chapters.forEach(({ title, content }, index) => {
      const chapterId = `chapter_${index}`;
      const chapterFileName = `${chapterId}.xhtml`;
      textFolder.file(
        chapterFileName,
        xhtmlPage(title, `  <h2 class="chapter-title">${escapeXml(title)}</h2>\n${content}`)
      );
      manifestItems.push({ id: chapterId, href: `Text/${chapterFileName}`, mediaType: 'application/xhtml+xml' });
      spineItems.push({ idref: chapterId });
    });

    // Navigation document (human-readable table of contents).
    const tocEntries = chapters
      .map((chapter, index) => `      <li><a href="chapter_${index}.xhtml">${escapeXml(chapter.title)}</a></li>`)
      .join('\n');
    textFolder.file(
      'toc.xhtml',
      xhtmlPage(
        '目录',
        [
          '  <nav epub:type="toc" id="toc">',
          '    <h1>目录</h1>',
          '    <ol>',
          tocEntries,
          '    </ol>',
          '  </nav>'
        ].join('\n'),
        ' xmlns:epub="http://www.idpf.org/2007/ops"'
      )
    );

    // Stylesheet: fall back to an empty one so a missing CSS file does not
    // abort the whole build.
    const cssPath = path.join(__dirname, 'epub_styles.css');
    let cssContent = '';
    if (fs.existsSync(cssPath)) {
      cssContent = fs.readFileSync(cssPath, 'utf-8');
    } else {
      console.warn(`未找到样式文件 ${cssPath},将使用空样式表`);
    }
    oebps.folder('Styles').file('stylesheet.css', cssContent);
    manifestItems.push({ id: 'stylesheet', href: 'Styles/stylesheet.css', mediaType: 'text/css' });

    const bookUUID = uuidv4();
    const safeBookTitle = escapeXml(bookTitle);

    // NCX table of contents: cover, contents page, then every chapter.
    const navPoint = (order, label, src) =>
      [
        `    <navPoint id="navpoint-${order}" playOrder="${order}">`,
        `      <navLabel><text>${escapeXml(label)}</text></navLabel>`,
        `      <content src="${src}"/>`,
        '    </navPoint>'
      ].join('\n');
    const navPoints = [
      navPoint(1, '封面', 'Text/cover.xhtml'),
      navPoint(2, '目录', 'Text/toc.xhtml'),
      ...chapters.map((chapter, index) => navPoint(index + 3, chapter.title, `Text/chapter_${index}.xhtml`))
    ].join('\n');
    oebps.file(
      'toc.ncx',
      [
        XML_DECLARATION,
        '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">',
        '  <head>',
        `    <meta name="dtb:uid" content="urn:uuid:${bookUUID}"/>`,
        '    <meta name="dtb:depth" content="1"/>',
        '    <meta name="dtb:totalPageCount" content="0"/>',
        '    <meta name="dtb:maxPageNumber" content="0"/>',
        '  </head>',
        `  <docTitle><text>${safeBookTitle}</text></docTitle>`,
        '  <navMap>',
        navPoints,
        '  </navMap>',
        '</ncx>',
        ''
      ].join('\n')
    );

    // Package document.
    const now = new Date().toISOString();
    const manifestXml = manifestItems
      .map((item) => {
        const properties = item.properties ? ` properties="${item.properties}"` : '';
        return `    <item id="${item.id}" href="${item.href}" media-type="${item.mediaType}"${properties}/>`;
      })
      .join('\n');
    const spineXml = spineItems
      .map((item) => {
        const linear = item.linear ? ` linear="${item.linear}"` : '';
        return `    <itemref idref="${item.idref}"${linear}/>`;
      })
      .join('\n');
    oebps.file(
      'content.opf',
      [
        XML_DECLARATION,
        '<package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="BookId">',
        '  <metadata xmlns:dc="http://purl.org/dc/elements/1.1/">',
        `    <dc:identifier id="BookId">urn:uuid:${bookUUID}</dc:identifier>`,
        `    <dc:title>${safeBookTitle}</dc:title>`,
        `    <dc:creator>${escapeXml(author)}</dc:creator>`,
        '    <dc:language>zh-CN</dc:language>',
        `    <dc:date>${now.split('T')[0]}</dc:date>`,
        // EPUB 3 requires dcterms:modified in the form CCYY-MM-DDThh:mm:ssZ.
        `    <meta property="dcterms:modified">${now.replace(/\.\d+Z$/, 'Z')}</meta>`,
        '    <meta name="cover" content="cover-image"/>',
        '  </metadata>',
        '  <manifest>',
        manifestXml,
        '  </manifest>',
        '  <spine toc="ncx">',
        spineXml,
        '  </spine>',
        '</package>',
        ''
      ].join('\n')
    );

    const epubContent = await zip.generateAsync({
      type: 'nodebuffer',
      compression: 'DEFLATE',
      mimeType: 'application/epub+zip'
    });
    fs.writeFileSync(outputPath, epubContent);
    console.log(`EPUB 电子书已生成: ${outputPath}`);
  } catch (error) {
    console.error('生成 EPUB 电子书时出错:', error);
  }
}
/**
 * Command-line entry point.
 *
 * Pipeline: crawl the novel -> merge the chapter files into one text file ->
 * format that text as HTML -> build the EPUB. All intermediate and final
 * files live in one output directory (default: "<script dir>/<title>",
 * override with --output=<dir>).
 *
 * NOTE(review): this code runs at module load, so importing this file for
 * `runScript` also starts the pipeline — confirm whether that is intended.
 */
//const title = "Zhui-Xu";
const title = "Throne-of-Magical-Arcana";
//const title = 'Release-that-Witch';
//const title = 'Strange-Life-of-a-Cat';
//const title = "Hidden-Assassin";
//const author = "Angry Banana";
const author = "Cuttlefish That Loves Diving";
const coverName = "cover.jpg";

// Stage switches, handy while re-running only part of the pipeline.
const SHOULD_CRAWL = true;
const SHOULD_MERGE = true;

const args = process.argv.slice(2);

// Print usage and exit.
if (args.includes("--help") || args.includes("-h")) {
  console.log([
    `用法: node ${path.basename(__filename)} [选项]`,
    "",
    "选项:",
    "  --debug          开启调试模式",
    "  --output=<目录>  指定输出目录 (默认: 脚本目录下以小说标题命名的目录)",
    "  -h, --help       显示帮助信息"
  ].join("\n"));
  process.exit(0);
}

const debug = args.includes("--debug");
// Take everything after the first "=" so paths containing "=" survive.
const outputArg = args.find(arg => arg.startsWith("--output="))?.slice("--output=".length);
// Every stage reads and writes below this one directory.
const outputDir = outputArg || path.join(__dirname, title);
const fetchContent = true;

console.log(`开始爬取小说: ${title}`);
console.log(`调试模式: ${debug ? "开启" : "关闭"}`);
console.log(`输出目录: ${outputDir}`);
console.log(`爬取章节内容: ${fetchContent ? "是" : "否"}`);

// Make sure the output directory exists.
fs.mkdirSync(outputDir, { recursive: true });

if (SHOULD_CRAWL) {
  try {
    const result = await runScript(title, debug, outputDir, fetchContent);
    if (result && result.errcode) {
      console.error(`爬取失败: ${result.errStr}`);
      process.exit(1);
    }
    console.log("爬取任务完成!");
  } catch (err) {
    console.error("程序执行出错:", err);
    process.exit(1);
  }
}

const sourceDir = path.join(outputDir, `${title}_contents`);
const outputFile = path.join(outputDir, `${title}.txt`);
const outputFile2 = path.join(outputDir, `${title}.html`);
const coverFile = path.join(outputDir, coverName);
const epubFile = path.join(outputDir, `${title}.epub`);

if (SHOULD_MERGE) {
  try {
    await mergeChapterFiles(sourceDir, outputFile);
    console.log('合并操作完成');
  } catch (err) {
    console.error('合并操作失败:', err);
  }
  await formatTextFile(outputFile, outputFile2);
}

await generateEpub(outputFile2, coverFile, epubFile, title, author);

// Export the crawler so other modules can use it.
export default runScript;