前言
常见的爬虫方式有两种:
通过接口抓取:需要模拟接口的请求头和传参。
通过网页抓取:需要先加载网页,再解析网页内容。
网页抓取
puppeteer(NodeJS)
使用无头浏览器模式抓取。
如果使用 Node.js,推荐使用 Puppeteer:它由谷歌出品,比较好用。
官方文档
https://pptr.dev/(中文翻译:https://pptr.nodejs.cn/)
安装依赖
1
| npm install puppeteer@24.8.2 pdf-lib@1.17.1
|
其中
puppeteer 把HTML转为PDF
pdf-lib 把封面、目录、正文的PDF进行合并
安装慢可以使用
1 2
| npm install -g cnpm --registry=https://registry.npmmirror.com
| cnpm install puppeteer@24.8.2
|
示例
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
const puppeteer = require("puppeteer");
// Imported by the original snippet; not used in the code below.
const fs = require("fs").promises;

// Opens the report page in headless Chrome, assigns a generated id to every
// h1/h2/h3 found inside each ".page" element, and prints the collected
// headings (text, tag name, id, 1-based index of the containing ".page").
(async () => {
  // puppeteer v24 accepts `true` (headless) or "shell"; the older "new" value
  // is no longer part of the option type.
  const browser = await puppeteer.launch({ headless: true });

  try {
    const page = await browser.newPage();
    await page.goto("http://localhost:5173/#/report_school", {
      waitUntil: "networkidle0",
    });

    await page.waitForSelector(".page");
    // Fixed 1 s delay after ".page" appears — presumably to let client-side
    // rendering settle before the DOM is read.
    await new Promise((resolve) => setTimeout(resolve, 1000));

    // Runs inside the browser context; only the returned plain data is
    // transferred back to Node.
    const headers = await page.evaluate(() => {
      const result = [];
      let pageNumber = 1;

      document.querySelectorAll(".page").forEach((pageItem) => {
        pageItem.querySelectorAll("h1, h2, h3").forEach((header) => {
          // result.length keeps ids unique even when Date.now() repeats
          // within the same millisecond.
          const id = `header-${Date.now()}-${result.length}`;
          header.id = id;

          result.push({
            text: header.textContent,
            level: header.tagName,
            id,
            page: pageNumber,
          });
        });

        pageNumber++;
      });

      return result;
    });

    console.info(headers);
  } finally {
    // Always shut the browser down; otherwise the Chromium child process
    // stays alive and the Node process never exits.
    await browser.close();
  }
})().catch((error) => {
  // Surface failures instead of leaving an unhandled rejection.
  console.error(error);
  process.exitCode = 1;
});
|
HtmlUnit(Java)
安装依赖
1 2 3 4 5 6 7 8
| <dependencies> <dependency> <groupId>net.sourceforge.htmlunit</groupId> <artifactId>htmlunit</artifactId> <version>2.52.0</version> </dependency> </dependencies>
|
示例
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
| import com.gargoylesoftware.htmlunit.WebClient; import com.gargoylesoftware.htmlunit.html.*; import com.gargoylesoftware.htmlunit.util.Cookie;
import java.util.logging.Level; import java.util.logging.Logger;
public class HtmlUnitLoginExample { public static void main(String[] args) throws Exception { Logger.getLogger("com.gargoylesoftware.htmlunit").setLevel(Level.OFF); try (WebClient webClient = new WebClient()) { webClient.getOptions().setJavaScriptEnabled(true); webClient.getOptions().setCssEnabled(false); webClient.getOptions().setThrowExceptionOnFailingStatusCode(false); webClient.getOptions().setThrowExceptionOnScriptError(false); webClient.getCookieManager().setCookiesEnabled(true); webClient.getOptions().setTimeout(10000); HtmlPage loginPage = webClient.getPage("https://github.com/login"); HtmlForm form = loginPage.getFormByName("login"); HtmlTextInput usernameInput = form.getInputByName("login"); usernameInput.setValueAttribute("your_username"); HtmlPasswordInput passwordInput = form.getInputByName("password"); passwordInput.setValueAttribute("your_password"); HtmlButton loginButton = form.getFirstByXPath("//input[@type='submit']"); HtmlPage resultPage = loginButton.click(); boolean isLoggedIn = resultPage.getTitleText().contains("GitHub"); if (isLoggedIn) { System.out.println("登录成功!"); for (Cookie cookie : webClient.getCookieManager().getCookies()) { System.out.println(cookie.getName() + " = " + cookie.getValue()); } HtmlPage profilePage = webClient.getPage("https://github.com/settings/profile"); System.out.println("个人资料页面标题: " + profilePage.getTitleText()); } else { System.out.println("登录失败!"); System.out.println("页面内容: " + resultPage.asNormalizedText()); } } } }
|
WebMagic
需要登录才能访问的网站不适合使用 WebMagic。
https://webmagic.io/
添加依赖
1 2 3 4 5 6 7 8 9 10
| <dependency> <groupId>us.codecraft</groupId> <artifactId>webmagic-core</artifactId> <version>0.7.3</version> </dependency> <dependency> <groupId>us.codecraft</groupId> <artifactId>webmagic-extension</artifactId> <version>0.7.3</version> </dependency>
|