#!/usr/bin/env node

/**
 * Playwright browser toolkit.
 *
 * Convenience helpers for visiting a web page, taking screenshots,
 * extracting page content and running custom page actions.
 */

import { chromium } from 'playwright';
import fs from 'fs';
import path from 'path';
import { pathToFileURL } from 'url';

/** User agent sent by every browser context unless the caller overrides it. */
const DEFAULT_USER_AGENT =
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

/**
 * Launch a browser, open a fresh page, run `task` against it and always
 * close the browser afterwards.
 *
 * Context and page creation happen inside the `try` so that a failure there
 * cannot leave a browser process running. Errors from launching the browser,
 * creating the context/page, or closing the browser propagate to the caller.
 *
 * @param {{ headless: boolean, userAgent: string }} settings - Launch/context settings.
 * @param {(page: import('playwright').Page) => Promise<T>} task - Work to run with the page.
 * @returns {Promise<T>} Whatever `task` resolves to.
 * @template T
 */
async function withPage({ headless, userAgent }, task) {
  const browser = await chromium.launch({ headless });
  try {
    const context = await browser.newContext({ userAgent });
    const page = await context.newPage();
    return await task(page);
  } finally {
    await browser.close();
  }
}

/**
 * Turn any thrown value into a printable message.
 * Non-Error values (e.g. a thrown string) have no `message` property.
 *
 * @param {unknown} error - The caught value.
 * @returns {string} A human-readable message.
 */
function describeError(error) {
  return error?.message ?? String(error);
}

/**
 * Visit a page and collect its title, HTML, visible text and a screenshot.
 *
 * Navigation and extraction failures are reported through the returned
 * object (`success: false`) rather than thrown.
 *
 * @param {string} url - URL to visit.
 * @param {Object} [options] - Configuration options.
 * @param {boolean} [options.headless=true] - Run the browser headless.
 * @param {string} [options.waitUntil='domcontentloaded'] - Navigation wait condition passed to `page.goto`.
 * @param {number} [options.timeout=30000] - Navigation timeout in milliseconds.
 * @param {boolean} [options.screenshot=true] - Whether to capture a screenshot.
 * @param {?string} [options.screenshotPath=null] - File to write the screenshot to; missing parent directories are created.
 * @param {boolean} [options.fullPage=false] - Capture the full scrollable page.
 * @param {string} [options.userAgent] - Override for the default user agent.
 * @returns {Promise<Object>} On success `{ success: true, url, title, content, text, screenshot }`
 *   (`screenshot` is a Buffer or `null`); on failure `{ success: false, url, error }`.
 */
export async function visitPage(url, options = {}) {
  const {
    headless = true,
    waitUntil = 'domcontentloaded',
    timeout = 30000,
    // Renamed locally so it does not shadow the exported `screenshot` function.
    screenshot: takeScreenshot = true,
    screenshotPath = null,
    fullPage = false,
    userAgent = DEFAULT_USER_AGENT,
  } = options;

  return withPage({ headless, userAgent }, async (page) => {
    try {
      console.log(`🌐 正在访问: ${url}`);
      await page.goto(url, { waitUntil, timeout });

      const title = await page.title();
      console.log(`📄 页面标题: ${title}`);

      // Full serialized HTML of the page.
      const content = await page.content();

      // Visible text; documents without a <body> (e.g. XML/SVG) yield ''.
      const text = await page.evaluate(() => document.body?.innerText ?? '');

      let screenshotBuffer = null;
      if (takeScreenshot) {
        screenshotBuffer = await page.screenshot({ fullPage });
        if (screenshotPath) {
          // Ensure the target directory exists so the write cannot fail on a missing folder.
          fs.mkdirSync(path.dirname(screenshotPath), { recursive: true });
          fs.writeFileSync(screenshotPath, screenshotBuffer);
          console.log(`📸 截图已保存: ${screenshotPath}`);
        }
      }

      return {
        success: true,
        url,
        title,
        content,
        text,
        screenshot: screenshotBuffer,
      };
    } catch (error) {
      const message = describeError(error);
      console.error(`❌ 访问失败: ${message}`);
      return { success: false, url, error: message };
    }
  });
}

/**
 * Capture a screenshot of a page and save it to disk.
 *
 * @param {string} url - Page URL.
 * @param {string} outputPath - Where to write the screenshot.
 * @param {Object} [options] - Same options as `visitPage`; `screenshot` and `screenshotPath` are overridden.
 * @returns {Promise<Object>} The `visitPage` result.
 */
export async function screenshot(url, outputPath, options = {}) {
  return visitPage(url, {
    ...options,
    screenshot: true,
    screenshotPath: outputPath,
  });
}

/**
 * Extract a page's content without taking a screenshot.
 *
 * @param {string} url - Page URL.
 * @param {Object} [options] - Same options as `visitPage`; `screenshot` is forced to `false`.
 * @returns {Promise<Object>} The `visitPage` result (its `screenshot` field is `null` on success).
 */
export async function extractText(url, options = {}) {
  return visitPage(url, {
    ...options,
    screenshot: false,
  });
}

/**
 * Open a page and run a caller-supplied action against it.
 *
 * Navigation failures and errors thrown by `action` are reported through the
 * returned object (`success: false`) rather than thrown.
 *
 * @param {string} url - Page URL.
 * @param {Function} action - Called as `action(page)` after navigation; may be async.
 * @param {Object} [options] - Configuration options.
 * @param {boolean} [options.headless=true] - Run the browser headless.
 * @param {string} [options.waitUntil='domcontentloaded'] - Navigation wait condition passed to `page.goto`.
 * @param {number} [options.timeout=30000] - Navigation timeout in milliseconds.
 * @param {string} [options.userAgent] - Override for the default user agent.
 * @returns {Promise<Object>} `{ success: true, url, result }` or `{ success: false, url, error }`.
 */
export async function customAction(url, action, options = {}) {
  const {
    headless = true,
    waitUntil = 'domcontentloaded',
    timeout = 30000,
    userAgent = DEFAULT_USER_AGENT,
  } = options;

  return withPage({ headless, userAgent }, async (page) => {
    try {
      console.log(`🌐 正在访问: ${url}`);
      await page.goto(url, { waitUntil, timeout });

      const result = await action(page);

      return { success: true, url, result };
    } catch (error) {
      const message = describeError(error);
      console.error(`❌ 操作失败: ${message}`);
      return { success: false, url, error: message };
    }
  });
}

/**
 * Report whether this module is the script Node was started with.
 *
 * `pathToFileURL` produces the same encoding Node uses for `import.meta.url`,
 * so the comparison also works for Windows paths and for paths containing
 * spaces or other characters that must be percent-encoded.
 *
 * @returns {boolean} True when the file was run directly rather than imported.
 */
function isMainModule() {
  const entryPath = process.argv[1];
  if (!entryPath) {
    return false;
  }
  return import.meta.url === pathToFileURL(entryPath).href;
}

// When run directly: visit the URL given on the command line and save a screenshot.
if (isMainModule()) {
  const url = process.argv[2] || 'https://www.baidu.com';
  const outputDir = path.join(process.cwd(), 'screenshots');

  // `recursive: true` makes this a no-op when the directory already exists.
  fs.mkdirSync(outputDir, { recursive: true });

  // ':' and '.' are replaced so the timestamp is a valid file name on every platform.
  const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
  const screenshotPath = path.join(outputDir, `screenshot-${timestamp}.png`);

  visitPage(url, { screenshotPath })
    .then((result) => {
      if (result.success) {
        console.log('\n✅ 访问成功!');
        console.log(`标题: ${result.title}`);
        console.log(`文本长度: ${result.text.length} 字符`);
      } else {
        console.log('\n❌ 访问失败');
        console.log(`错误: ${result.error}`);
      }

      process.exit(result.success ? 0 : 1);
    })
    .catch((error) => {
      console.error('Error:', error);
      process.exit(1);
    });
}

export default {
  visitPage,
  screenshot,
  extractText,
  customAction,
};