You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
125 lines
3.6 KiB
125 lines
3.6 KiB
#!/usr/bin/env node
|
|
import { chromium } from 'playwright';
|
|
import fs from 'fs';
|
|
import path from 'path';
|
|
import { fileURLToPath } from 'url';
|
|
|
|
const __filename = fileURLToPath(import.meta.url);
|
|
const __dirname = path.dirname(__filename);
|
|
|
|
/** Maximum summary size, in characters, for each supported length preset. */
const SUMMARY_LENGTHS = new Map([
  ['short', 300],
  ['medium', 800],
  ['long', 2000],
  ['xl', 5000],
]);

/**
 * Collects the title, visible text, links and images of the current document.
 *
 * This function is handed to `page.evaluate`, so it executes in the page, not
 * in Node. It must stay self-contained: it may not reference any variable
 * from this module.
 *
 * Note that it mutates the page: noisy elements are removed first, so links
 * and images inside them are not reported either.
 *
 * @returns {{title: string, text: string,
 *   links: Array<{text: string, href: string}>,
 *   images: Array<{alt: string, src: string}>}} Extracted page content.
 */
function extractPageContent() {
  // Drop elements that only add noise to the extracted text.
  const selectors = 'script, style, nav, footer, aside, .ad, .advertisement, .popup, .cookie-banner';
  document.querySelectorAll(selectors).forEach((el) => el.remove());

  const title = document.title;

  // Every whitespace run (newlines included) collapses to a single space.
  // `document.body` can be null for non-HTML documents, hence the fallback.
  const text = (document.body?.innerText ?? '').replace(/\s+/g, ' ').trim();

  // First 10 absolute links that have visible text.
  const links = Array.from(document.querySelectorAll('a[href]'))
    .map((a) => ({ text: a.textContent.trim(), href: a.href }))
    .filter((link) => link.text && link.href.startsWith('http'))
    .slice(0, 10);

  // First 5 images with an absolute source URL.
  const images = Array.from(document.querySelectorAll('img[src]'))
    .map((img) => ({ alt: img.alt.trim(), src: img.src }))
    .filter((img) => img.src.startsWith('http'))
    .slice(0, 5);

  return { title, text, links, images };
}

/**
 * Prints a summary result to stdout in the human-readable text layout.
 *
 * @param {object} result - The object built by `summarize`.
 */
function printTextReport(result) {
  console.log(`\n📄 Title: ${result.title}`);
  console.log(`\n📝 Summary:`);
  console.log(result.summary);
  console.log(`\n📊 Stats:`);
  console.log(` - Total characters: ${result.stats.totalCharacters}`);
  console.log(` - Summary length: ${result.stats.summaryLength}`);
  console.log(` - Links found: ${result.stats.linksFound}`);
  console.log(` - Images found: ${result.stats.imagesFound}`);

  if (result.topLinks.length > 0) {
    console.log(`\n🔗 Top links:`);
    result.topLinks.forEach((link, i) => {
      console.log(` ${i + 1}. ${link.text} - ${link.href}`);
    });
  }
}

/**
 * Best-effort removal of every `playwright_*` entry directly under `tmpDir`.
 *
 * Failures are reported as warnings and never thrown, so a cleanup problem
 * cannot discard an already computed summary.
 *
 * @param {string} [tmpDir='/tmp'] - Directory to sweep.
 */
function removePlaywrightTempFiles(tmpDir = '/tmp') {
  let entries;
  try {
    entries = fs.readdirSync(tmpDir);
  } catch (err) {
    // A missing directory simply means there is nothing to clean up.
    if (err.code !== 'ENOENT') {
      console.warn(`⚠️ Could not list ${tmpDir}: ${err.message}`);
    }
    return;
  }

  for (const name of entries) {
    if (!name.startsWith('playwright_')) continue;
    const fullPath = path.join(tmpDir, name);
    try {
      // `recursive` + `force` handles files, directories and entries that
      // vanished since the listing, so no separate stat call is needed.
      fs.rmSync(fullPath, { recursive: true, force: true });
    } catch (err) {
      console.warn(`⚠️ Could not remove ${fullPath}: ${err.message}`);
    }
  }
}

/**
 * Loads `url` in headless Chromium and produces a truncated-text summary
 * together with basic statistics, the first links and the first images.
 *
 * The result is also printed to stdout, as pretty JSON or as a text report.
 *
 * @param {string} url - Page to load.
 * @param {object} [options]
 * @param {string} [options.length='medium'] - One of `short` (300 chars),
 *   `medium` (800), `long` (2000) or `xl` (5000). Any other value falls back
 *   to `medium`.
 * @param {string} [options.output='text'] - `'json'` prints the result as
 *   JSON; any other value prints the text report.
 * @param {boolean} [options.cleanupTemp=true] - Whether to delete
 *   `playwright_*` entries from `/tmp` once the browser is closed. Set to
 *   `false` when other Playwright sessions may be using that directory.
 * @returns {Promise<object>} `{ url, title, summary, stats, topLinks, topImages }`.
 * @throws Rejects with whatever error browser launch, navigation or page
 *   evaluation raises (for example a navigation timeout after 30 s).
 */
async function summarize(url, options = {}) {
  const { length = 'medium', output = 'text', cleanupTemp = true } = options;

  console.log(`✨ Summarizing: ${url}`);
  console.log(`📏 Length: ${length}`);

  const browser = await chromium.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  let content;
  try {
    const page = await browser.newPage();
    await page.goto(url, {
      waitUntil: 'domcontentloaded',
      timeout: 30000,
    });
    content = await page.evaluate(extractPageContent);
  } finally {
    // Always release the browser, even when navigation or extraction fails.
    await browser.close();
    if (cleanupTemp) {
      removePlaywrightTempFiles();
    }
  }

  // A Map lookup cannot accidentally match inherited keys such as "constructor".
  const maxLength = SUMMARY_LENGTHS.get(length) ?? SUMMARY_LENGTHS.get('medium');
  const summary =
    content.text.length > maxLength
      ? `${content.text.slice(0, maxLength)}...`
      : content.text;

  const result = {
    url,
    title: content.title,
    summary,
    stats: {
      totalCharacters: content.text.length,
      summaryLength: summary.length,
      linksFound: content.links.length,
      imagesFound: content.images.length,
    },
    topLinks: content.links,
    topImages: content.images,
  };

  if (output === 'json') {
    console.log(JSON.stringify(result, null, 2));
  } else {
    printTextReport(result);
  }

  return result;
}
|
|
|
|
// When this file is executed directly (rather than imported), run a demo.

/**
 * Reports whether this module is the script Node was started with.
 *
 * `process.argv[1]` is compared after resolving symlinks, because the path
 * derived from `import.meta.url` may differ from the path typed on the
 * command line when the script is launched through a symlink.
 *
 * @returns {boolean} True when the file is being run as the entry script.
 */
function isDirectRun() {
  const entry = process.argv[1];
  if (!entry) {
    return false;
  }
  try {
    return fs.realpathSync(entry) === __filename;
  } catch {
    // The path could not be resolved; fall back to a plain comparison.
    return entry === __filename;
  }
}

if (isDirectRun()) {
  const url = process.argv[2] || 'https://www.baidu.com';
  const length = process.argv[3] || 'medium';

  // Report failures and signal them through the exit code instead of leaving
  // the promise unhandled.
  summarize(url, { length }).catch((err) => {
    console.error(`❌ Failed to summarize ${url}: ${err.message}`);
    process.exitCode = 1;
  });
}
|
|
|
|
export default summarize;
|
|
|