SpringBoot+WebMagic+MyBatis实现爬虫和数据入库的示例

WebMagic是一个开源爬虫框架,本项目通过在SpringBoot项目中使用WebMagic去抓取数据,最后使用MyBatis将数据入库。

本项目代码地址:ArticleCrawler: SpringBoot+WebMagic+MyBatis实现爬虫和数据入库 (gitee.com)

创建数据库:

本示例中库名为article,表名为cms_content,表中包含contentId、title、date三个字段。

-- CMS content table: one row per crawled article (id, title, publish date).
-- The table uses utf8mb4 rather than MySQL's 3-byte `utf8`, so that titles
-- containing 4-byte characters (e.g. emoji) can be inserted without an
-- "Incorrect string value" error.
CREATE TABLE `cms_content` (
  `contentId` varchar(40) NOT NULL COMMENT '内容ID',
  `title` varchar(150) NOT NULL COMMENT '标题',
  `date` varchar(150) NOT NULL COMMENT '发布日期',
  PRIMARY KEY (`contentId`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='CMS内容表';

新建SpringBoot项目:

1、配置依赖pom.xml

<?xml version="1.0" encoding="UTF-8"?>

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"

xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">

<modelVersion>4.0.0</modelVersion>

<parent>

<groupId>org.springframework.boot</groupId>

<artifactId>spring-boot-starter-parent</artifactId>

<version>2.5.5</version>

<relativePath/>

</parent>

<groupId>com.example</groupId>

<artifactId>Article</artifactId>

<version>0.0.1-SNAPSHOT</version>

<name>Article</name>

<description>Article</description>

<!-- Build settings and third-party dependency versions referenced elsewhere in this POM. -->
<properties>
  <java.version>1.8</java.version>
  <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  <maven.test.skip>true</maven.test.skip>
  <maven.compiler.plugin.version>3.8.1</maven.compiler.plugin.version>
  <maven.resources.plugin.version>3.1.0</maven.resources.plugin.version>
  <mysql.connector.version>5.1.47</mysql.connector.version>
  <druid.spring.boot.starter.version>1.1.17</druid.spring.boot.starter.version>
  <mybatis.spring.boot.starter.version>1.3.4</mybatis.spring.boot.starter.version>
  <!-- Raised from 1.2.58: fastjson 1.x releases before 1.2.83 are affected by
       known autoType deserialization vulnerabilities. -->
  <fastjson.version>1.2.83</fastjson.version>
  <commons.lang3.version>3.9</commons.lang3.version>
  <joda.time.version>2.10.2</joda.time.version>
  <webmagic.core.version>0.7.5</webmagic.core.version>
</properties>

<!-- Project dependencies. Spring Boot artifacts inherit their version from the parent POM. -->
<dependencies>
  <!-- Spring Boot starters -->
  <dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-web</artifactId>
  </dependency>
  <dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-test</artifactId>
    <scope>test</scope>
  </dependency>
  <dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-configuration-processor</artifactId>
    <optional>true</optional>
  </dependency>

  <!-- Persistence: MySQL JDBC driver, Druid connection pool, MyBatis -->
  <dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>${mysql.connector.version}</version>
  </dependency>
  <dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>druid-spring-boot-starter</artifactId>
    <version>${druid.spring.boot.starter.version}</version>
  </dependency>
  <dependency>
    <groupId>org.mybatis.spring.boot</groupId>
    <artifactId>mybatis-spring-boot-starter</artifactId>
    <version>${mybatis.spring.boot.starter.version}</version>
  </dependency>

  <!-- General-purpose utility libraries -->
  <dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>${fastjson.version}</version>
  </dependency>
  <dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>${commons.lang3.version}</version>
  </dependency>
  <dependency>
    <groupId>joda-time</groupId>
    <artifactId>joda-time</artifactId>
    <version>${joda.time.version}</version>
  </dependency>

  <!-- WebMagic crawler core. Its slf4j-log4j12 binding is excluded, presumably so it
       does not clash with the logging binding Spring Boot already provides. -->
  <dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>${webmagic.core.version}</version>
    <exclusions>
      <exclusion>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
      </exclusion>
    </exclusions>
  </dependency>
</dependencies>

<!-- Build plugins: compiler, resources, and Spring Boot packaging. -->
<build>
  <plugins>
    <!-- Compiles with the Java level and source encoding taken from the POM properties. -->
    <plugin>
      <groupId>org.apache.maven.plugins</groupId>
      <artifactId>maven-compiler-plugin</artifactId>
      <version>${maven.compiler.plugin.version}</version>
      <configuration>
        <source>${java.version}</source>
        <target>${java.version}</target>
        <encoding>${project.build.sourceEncoding}</encoding>
      </configuration>
    </plugin>

    <!-- Copies resources using the project source encoding. -->
    <plugin>
      <groupId>org.apache.maven.plugins</groupId>
      <artifactId>maven-resources-plugin</artifactId>
      <version>${maven.resources.plugin.version}</version>
      <configuration>
        <encoding>${project.build.sourceEncoding}</encoding>
      </configuration>
    </plugin>

    <!-- Spring Boot plugin with the repackage goal bound to the build. -->
    <plugin>
      <groupId>org.springframework.boot</groupId>
      <artifactId>spring-boot-maven-plugin</artifactId>
      <configuration>
        <fork>true</fork>
        <addResources>true</addResources>
      </configuration>
      <executions>
        <execution>
          <goals>
            <goal>repackage</goal>
          </goals>
        </execution>
      </executions>
    </plugin>
  </plugins>
</build>

<!-- Dependency repository: Aliyun public mirror.
     The URL uses HTTPS because Maven 3.8.1 and later block plain-HTTP repositories by default. -->
<repositories>
  <repository>
    <id>public</id>
    <name>aliyun nexus</name>
    <url>https://maven.aliyun.com/repository/public</url>
    <releases>
      <enabled>true</enabled>
    </releases>
  </repository>
</repositories>

<!-- Plugin repository: Aliyun public mirror, releases only.
     The URL uses HTTPS because Maven 3.8.1 and later block plain-HTTP repositories by default. -->
<pluginRepositories>
  <pluginRepository>
    <id>public</id>
    <name>aliyun nexus</name>
    <url>https://maven.aliyun.com/repository/public</url>
    <releases>
      <enabled>true</enabled>
    </releases>
    <snapshots>
      <enabled>false</enabled>
    </snapshots>
  </pluginRepository>
</pluginRepositories>

</project>

2、创建CmsContentPO.java

数据实体,和表中3个字段对应。

package site.exciter.article.model;

/**
 * Persistence object for one row of the {@code cms_content} table.
 *
 * <p>A plain mutable bean with three string properties; all of them are
 * {@code null} until set.
 */
public class CmsContentPO {

    /** Primary key of the content row. */
    private String contentId;

    /** Article title. */
    private String title;

    /** Publish date, kept as text. */
    private String date;

    /** @return the content id, or {@code null} if not set */
    public String getContentId() {
        return this.contentId;
    }

    /** @param contentId the content id to store */
    public void setContentId(String contentId) {
        this.contentId = contentId;
    }

    /** @return the title, or {@code null} if not set */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the publish date text, or {@code null} if not set */
    public String getDate() {
        return this.date;
    }

    /** @param date the publish date text to store */
    public void setDate(String date) {
        this.date = date;
    }
}

3、创建CrawlerMapper.java

package site.exciter.article.dao;

import org.apache.ibatis.annotations.Mapper;

import site.exciter.article.model.CmsContentPO;

/**
 * MyBatis mapper for persisting crawled content.
 *
 * <p>The SQL for each method is defined in {@code CrawlerMapper.xml} under the
 * statement id matching the method name.
 */
@Mapper
public interface CrawlerMapper {

    /**
     * Inserts one content record.
     *
     * @param record the content to insert
     * @return the result of the mapped insert statement; callers treat a value
     *         greater than zero as success
     */
    int addCmsContent(CmsContentPO record);
}

4、配置映射文件CrawlerMapper.xml

在resources下新建mapper文件夹,在mapper下创建CrawlerMapper.xml

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<!-- SQL statements bound to the site.exciter.article.dao.CrawlerMapper interface. -->
<mapper namespace="site.exciter.article.dao.CrawlerMapper">

  <!-- Inserts one cms_content row from a CmsContentPO; every column is bound as VARCHAR. -->
  <insert id="addCmsContent" parameterType="site.exciter.article.model.CmsContentPO">
    insert into cms_content (contentId, title, date)
    values (#{contentId,jdbcType=VARCHAR}, #{title,jdbcType=VARCHAR}, #{date,jdbcType=VARCHAR})
  </insert>

</mapper>

5、配置application.properties

配置数据库和mybatis映射关系。

# ---------------------------------------------------------------------------
# MySQL data source (served through the Druid pool)
# ---------------------------------------------------------------------------
spring.datasource.name=mysql
spring.datasource.type=com.alibaba.druid.pool.DruidDataSource
spring.datasource.driver-class-name=com.mysql.jdbc.Driver
# Target schema is "article"; adjust host, port and credentials for your environment.
spring.datasource.url=jdbc:mysql://10.201.61.184:3306/article?useUnicode=true&characterEncoding=utf8&useSSL=false&allowMultiQueries=true
spring.datasource.username=root
spring.datasource.password=root

# ---------------------------------------------------------------------------
# Druid connection pool
# ---------------------------------------------------------------------------
# Pool sizing and maximum wait (milliseconds) when acquiring a connection
spring.datasource.druid.initial-size=5
spring.datasource.druid.min-idle=5
spring.datasource.druid.max-active=10
spring.datasource.druid.max-wait=60000
# Connection validation
spring.datasource.druid.validation-query=SELECT 1 FROM DUAL
spring.datasource.druid.test-on-borrow=false
spring.datasource.druid.test-on-return=false
spring.datasource.druid.test-while-idle=true
# Eviction timing (milliseconds)
spring.datasource.druid.time-between-eviction-runs-millis=60000
spring.datasource.druid.min-evictable-idle-time-millis=300000
spring.datasource.druid.max-evictable-idle-time-millis=600000

# ---------------------------------------------------------------------------
# MyBatis: location of the mapper XML on the classpath
# ---------------------------------------------------------------------------
mybatis.mapperLocations=classpath:mapper/CrawlerMapper.xml

6、创建ArticlePageProcessor.java

解析html的逻辑。

package site.exciter.article;

import org.springframework.stereotype.Component;

import us.codecraft.webmagic.Page;

import us.codecraft.webmagic.Site;

import us.codecraft.webmagic.processor.PageProcessor;

import us.codecraft.webmagic.selector.Selectable;

/**
 * WebMagic page processor for the crawled blog.
 *
 * <p>For every downloaded page it extracts the article title and publish date
 * into the result fields, and queues any article-detail links and the
 * next-page link it finds for further crawling.
 */
@Component
public class ArticlePageProcessor implements PageProcessor {

    /** Links to article detail pages on a list page. */
    private static final String DETAIL_URLS_XPATH = "//*[@class='postTitle']/a[@class='postTitle2']/@href";

    /** Primary locator for the next-page link. */
    private static final String NEXT_PAGE_XPATH = "//*[@id='nav_next_page']/a/@href";

    /** Fallback locator for the next-page link, used when the XPath one does not match. */
    private static final String NEXT_PAGE_CSS = "#homepage_top_pager > div:nth-child(1) > a:nth-child(7)";

    /** Article title on a detail page. */
    private static final String TITLE_XPATH = "//h1[@class='postTitle']/a/span/text()";

    /** Publish date on a detail page. */
    private static final String DATE_XPATH = "//span[@id='post-date']/text()";

    /** Crawl settings: 3 retries, sleep time of 1000. */
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    /**
     * Extracts the title/date fields and enqueues follow-up links.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        String title = page.getHtml().xpath(TITLE_XPATH).toString();
        page.putField("title", title);
        if (title == null) {
            // No title was extracted: mark the page as skipped.
            page.setSkip(true);
        }
        page.putField("date", page.getHtml().xpath(DATE_XPATH).toString());

        Selectable detailLinks = page.getHtml().xpath(DETAIL_URLS_XPATH);
        if (detailLinks.match()) {
            page.addTargetRequests(detailLinks.all());
        }

        // Next page: prefer the XPath locator, fall back to the CSS locator.
        Selectable nextByXpath = page.getHtml().xpath(NEXT_PAGE_XPATH);
        if (nextByXpath.match()) {
            page.addTargetRequests(nextByXpath.all());
            return;
        }
        Selectable nextByCss = page.getHtml().css(NEXT_PAGE_CSS);
        if (nextByCss.match()) {
            page.addTargetRequests(nextByCss.links().all());
        }
    }

    /** @return the crawl settings used by this processor */
    @Override
    public Site getSite() {
        return site;
    }
}

7、创建ArticlePipeline.java

处理数据的持久化。

package site.exciter.article;

import org.slf4j.Logger;

import org.slf4j.LoggerFactory;

import org.springframework.beans.factory.annotation.Autowired;

import org.springframework.stereotype.Component;

import site.exciter.article.model.CmsContentPO;

import site.exciter.article.dao.CrawlerMapper;

import us.codecraft.webmagic.ResultItems;

import us.codecraft.webmagic.Task;

import us.codecraft.webmagic.pipeline.Pipeline;

import java.util.UUID;

@Component

public class ArticlePipeline implements Pipeline {

private static final Logger LOGGER = LoggerFactory.getLogger(ArticlePipeline.class);

@Autowired

private CrawlerMapper crawlerMapper;

public void process(ResultItems resultItems, Task task) {

String title = resultItems.get("title");

String date = resultItems.get("date");

CmsContentPO contentPO = new CmsContentPO();

contentPO.setContentId(UUID.randomUUID().toString());

contentPO.setTitle(title);

contentPO.setDate(date);

try {

boolean success = crawlerMapper.addCmsContent(contentPO) > 0;

LOGGER.info("保存成功:{}", title);

} catch (Exception ex) {

LOGGER.error("保存失败", ex);

}

}

}

8、创建ArticleTask.java

执行抓取任务。

package site.exciter.article;

import org.slf4j.Logger;

import org.slf4j.LoggerFactory;

import org.springframework.beans.factory.annotation.Autowired;

import org.springframework.stereotype.Component;

import us.codecraft.webmagic.Spider;

import java.util.concurrent.Executors;

import java.util.concurrent.ScheduledExecutorService;

import java.util.concurrent.TimeUnit;

/**
 * Schedules the article crawl.
 *
 * <p>{@link #crawl()} registers a repeating job on a single-threaded scheduler;
 * each run builds a new spider with the injected page processor and pipeline
 * and starts it.
 */
@Component
public class ArticleTask {

    // Log under this class's own name (was ArticlePipeline.class, which
    // attributed scheduler errors to the wrong component).
    private static final Logger LOGGER = LoggerFactory.getLogger(ArticleTask.class);

    @Autowired
    private ArticlePipeline articlePipeline;

    @Autowired
    private ArticlePageProcessor articlePageProcessor;

    private final ScheduledExecutorService timer = Executors.newSingleThreadScheduledExecutor();

    /**
     * Starts the recurring crawl: the first run begins immediately and each
     * following run is scheduled 10 minutes after the previous one returns.
     */
    public void crawl() {
        // Scheduled job: crawl once every 10 minutes
        timer.scheduleWithFixedDelay(() -> {
            Thread.currentThread().setName("ArticleCrawlerThread");
            try {
                Spider.create(articlePageProcessor)
                        .addUrl("http://www.cnblogs.com/dick159/default.html?page=2")
                        // Store the crawled data in the database
                        .addPipeline(articlePipeline)
                        // Crawl with 5 threads
                        .thread(5)
                        // Start the spider asynchronously
                        .start();
            } catch (Exception ex) {
                // Catch everything so a failed run does not cancel later runs.
                LOGGER.error("定时抓取数据线程执行异常", ex);
            }
        }, 0, 10, TimeUnit.MINUTES);
    }
}

9、修改Application

package site.exciter.article;

import org.mybatis.spring.annotation.MapperScan;

import org.springframework.beans.factory.annotation.Autowired;

import org.springframework.boot.CommandLineRunner;

import org.springframework.boot.SpringApplication;

import org.springframework.boot.autoconfigure.SpringBootApplication;

/**
 * Spring Boot entry point. Once the application context is ready it starts the
 * scheduled crawl through {@link ArticleTask}.
 */
@SpringBootApplication
// Scan the package that actually contains CrawlerMapper. The previous value
// ("site.exciter.article.interface") matches no package in this project, and
// "interface" is a reserved word that cannot be a package name.
@MapperScan(basePackages = "site.exciter.article.dao")
public class ArticleApplication implements CommandLineRunner {

    @Autowired
    private ArticleTask articleTask;

    /**
     * Launches the Spring application.
     *
     * @param args command-line arguments passed through to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication.run(ArticleApplication.class, args);
    }

    /**
     * Runs after startup and kicks off the recurring crawl.
     *
     * @param args command-line arguments (unused)
     */
    @Override
    public void run(String... args) throws Exception {
        articleTask.crawl();
    }
}

10、执行application,开始抓数据并入库

到此,这篇关于SpringBoot+WebMagic+MyBatis实现爬虫和数据入库的示例的文章就介绍到这了。更多相关SpringBoot+WebMagic+MyBatis爬虫和数据入库的内容,请搜索以前的文章或继续浏览下面的相关文章,希望大家以后多多支持!

以上是 SpringBoot+WebMagic+MyBatis实现爬虫和数据入库的示例 的全部内容, 来源链接: utcz.com/p/249828.html

回到顶部