1.创建Maven工程并在pom.xml中添加依赖
<!-- Dependencies for the crawler project: Spring Boot web + JPA, MySQL driver, WebMagic, Guava, commons-lang3, Lombok. -->
<!-- NOTE(review): several entries declare no <version>; presumably a Spring Boot parent/BOM outside this snippet manages them - confirm. -->
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-jpa</artifactId>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
</dependency>
<!-- WebMagic core library; its slf4j-log4j12 binding is excluded (presumably to avoid a clash with the logging binding Spring Boot provides - confirm). -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- WebMagic extension module, same version as the core. -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
</dependency>
<!-- Guava: needed for WebMagic's Bloom filter support. -->
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>16.0</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.7</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
</dependency>
</dependencies>
2.创建application.yml文件
# Spring Boot configuration: MySQL datasource plus JPA settings.
# Nesting matters here: datasource and jpa are both children of "spring",
# i.e. the properties are spring.datasource.* and spring.jpa.*.
spring:
  datasource:
    url: jdbc:mysql://localhost:3306/test01
    # Local development credentials; replace for any shared environment.
    username: root
    password: root
    # NOTE(review): MySQL Connector/J 8.x names this class com.mysql.cj.jdbc.Driver -
    # confirm which connector version the build actually resolves.
    driver-class-name: com.mysql.jdbc.Driver
  jpa:
    database: MySQL
    # Print every SQL statement issued through JPA.
    show-sql: true
3.创建数据库表
-- One row per scraped job posting. Column names are the snake_case
-- counterparts of the fields on the JobInfo entity.
CREATE TABLE job_info (
    id           INT AUTO_INCREMENT PRIMARY KEY,  -- surrogate key, generated by the database
    company_name VARCHAR(256),
    company_addr VARCHAR(256),
    company_info VARCHAR(500),
    job_name     VARCHAR(256),
    job_info     VARCHAR(500),
    job_addr     VARCHAR(256),
    url          VARCHAR(256),                    -- URL of the posting's detail page
    time         VARCHAR(128),                    -- publication time, stored as text
    salary_min   INT,
    salary_max   INT
);
4.创建启动类
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableScheduling;
/**
 * Spring Boot entry point for the crawler application.
 *
 * <p>{@code @EnableScheduling} switches on processing of {@code @Scheduled}
 * methods so that scheduled tasks declared on beans are run.
 */
@SpringBootApplication
@EnableScheduling // enables scheduled-task support
public class Application {

    /**
     * Boots the Spring application context.
     *
     * @param args command-line arguments, forwarded to Spring Boot
     */
    public static void main(String[] args) {
        // Forward args: without them, command-line overrides such as
        // --server.port=... or --spring.datasource.url=... are silently ignored.
        SpringApplication.run(Application.class, args);
    }
}
5.创建数据库表对应映射
5.1创建pojo
import lombok.Data;
import javax.persistence.Entity;
import javax.persistence.GeneratedValue;
import javax.persistence.GenerationType;
import javax.persistence.Id;
/**
 * JPA entity describing one scraped job posting.
 *
 * <p>Lombok's {@code @Data} supplies the getters, setters, {@code equals} and
 * {@code hashCode}. {@code toString()} is written by hand below and therefore
 * replaces the one Lombok would otherwise generate.
 */
@Entity
@Data
public class JobInfo {

    /** Primary key, assigned by the database (identity column). */
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Long id;

    private String companyName;
    private String companyAddr;
    private String companyInfo;
    private String jobName;
    private String jobInfo;
    private String jobAddr;

    /** URL of the posting's detail page. */
    private String url;

    /** Publication time of the posting, kept as text. */
    private String time;

    private Integer salaryMin;
    private Integer salaryMax;

    /**
     * Returns a readable dump of every field, in declaration order.
     *
     * @return string of the form {@code JobInfo{id=..., companyName='...', ...}}
     */
    @Override
    public String toString() {
        return "JobInfo{" +
                "id=" + id +
                ", companyName='" + companyName + '\'' +
                ", companyAddr='" + companyAddr + '\'' +
                ", companyInfo='" + companyInfo + '\'' +
                ", jobName='" + jobName + '\'' +
                ", jobInfo='" + jobInfo + '\'' +
                // jobAddr was missing from the output although it is a regular field.
                ", jobAddr='" + jobAddr + '\'' +
                ", url='" + url + '\'' +
                ", time='" + time + '\'' +
                ", salaryMin=" + salaryMin +
                ", salaryMax=" + salaryMax +
                '}';
    }
}
5.2创建dao
import org.springframework.data.jpa.repository.JpaRepository;
/**
 * Spring Data JPA repository for {@link JobInfo} entities with a {@code Long} id.
 *
 * <p>Declares no methods of its own; all operations used elsewhere in this file
 * ({@code findAll(Example)}, {@code saveAndFlush}) are inherited from
 * {@code JpaRepository}.
 */
public interface JobInfoDao extends JpaRepository<JobInfo,Long> {
}
6.创建操作实体类的Service
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Example;
import org.springframework.stereotype.Service;
import java.util.List;
/**
 * Persists scraped {@link JobInfo} records while avoiding duplicates.
 */
@Service
public class JobInfoService {

    @Autowired
    private JobInfoDao infoDao;

    /**
     * Stores a job posting unless an identical one is already present.
     *
     * <ul>
     *   <li>If a row with the same url and publication time exists, nothing is written.</li>
     *   <li>Otherwise, if a row with the same url exists (the posting was re-published
     *       with a new time), that row is updated instead of adding a second row for
     *       the same posting.</li>
     *   <li>Otherwise a new row is inserted.</li>
     * </ul>
     *
     * <p>When an existing row is updated, the id of that row is copied onto
     * {@code jobInfo}.
     *
     * @param jobInfo the posting to store; must not be {@code null}
     */
    public void save(JobInfo jobInfo) {
        // Probe by url + publication time: an exact match means the data is already current.
        JobInfo exactProbe = new JobInfo();
        exactProbe.setUrl(jobInfo.getUrl());
        exactProbe.setTime(jobInfo.getTime());
        if (!this.findJobInfo(exactProbe).isEmpty()) {
            return;
        }

        // No exact match: either the posting is new or it was re-published.
        // Reuse the id of an existing row for the same url so the save below is an
        // update rather than a duplicate insert. The url must be non-null here,
        // because an all-null probe would match every row in the table.
        if (jobInfo.getId() == null && jobInfo.getUrl() != null) {
            JobInfo urlProbe = new JobInfo();
            urlProbe.setUrl(jobInfo.getUrl());
            List<JobInfo> outdated = this.findJobInfo(urlProbe);
            if (!outdated.isEmpty()) {
                jobInfo.setId(outdated.get(0).getId());
            }
        }

        this.infoDao.saveAndFlush(jobInfo);
    }

    /**
     * Runs a query-by-example lookup using the given probe.
     *
     * @param jobInfo probe entity whose populated fields form the search criteria
     * @return the matching rows, possibly empty
     */
    private List<JobInfo> findJobInfo(JobInfo jobInfo) {
        Example<JobInfo> example = Example.of(jobInfo);
        return this.infoDao.findAll(example);
    }
}
7.创建WebMagic任务类
import org.jsoup.Jsoup;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import java.util.List;
/**
 * WebMagic page processor and scheduled crawl job for 51job "java" search results.
 *
 * <p>Starting from the search-result listing in {@link #url}, listing pages
 * contribute detail-page links and the next-page link to the crawl queue;
 * every other page is treated as a job detail page and parsed into a
 * {@link JobInfo}, which is handed to the pipeline under the result key
 * {@code "jobInfo"}.
 */
@Component
public class JobTask implements PageProcessor {

    /** First listing page of the search; the crawl starts here. */
    private final String url = "https://search.51job.com/list/020000,000000,0000,00,9,99,java,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=";

    /** Download settings shared by all requests of this crawler. */
    private final Site site = Site.me()
            .setCharset("gbk")          // page encoding
            .setTimeOut(10 * 1000)      // download timeout, in milliseconds
            .setRetrySleepTime(3000)    // pause between retries, in milliseconds
            .setRetryTimes(3);          // number of retries

    @Autowired
    private SpringDataPipeline springDataPipeline;

    /**
     * Handles one downloaded page.
     *
     * <p>A page containing result rows ({@code div#resultList div.el}) is a listing
     * page: the link of every row and the next-page link are queued. Any other page
     * is parsed as a job detail page.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        List<Selectable> resultRows = page.getHtml().css("div#resultList div.el").nodes();

        if (resultRows.size() == 0) {
            // No result rows: this is a detail page.
            this.saveJobInfo(page);
            return;
        }

        // Listing page: queue the detail-page link of every row.
        for (Selectable row : resultRows) {
            page.addTargetRequest(row.links().toString());
        }

        // Queue the next page. The second "li.bk" node is the one the original code
        // reads the next-page link from. Guard the index: an exception here would
        // abort processing of this page and lose every detail link queued above.
        List<Selectable> pagerButtons = page.getHtml().css("div.p_in li.bk").nodes();
        if (pagerButtons.size() > 1) {
            String bkUrl = pagerButtons.get(1).links().toString();
            System.out.println(bkUrl);
            page.addTargetRequest(bkUrl);
        }
    }

    /**
     * Parses a job detail page and publishes the result under the key {@code "jobInfo"}.
     *
     * <p>Pages without a job title ({@code div.cn h1}) are not treated as job
     * postings and produce no result. Sections that are simply absent from a
     * posting yield {@code null} fields instead of aborting the whole page.
     *
     * @param page the downloaded detail page
     */
    private void saveJobInfo(Page page) {
        Html html = page.getHtml();

        String jobName = html.css("div.cn h1", "text").toString();
        if (jobName == null) {
            // Not a recognisable job detail page; leave the result empty so the
            // pipeline has nothing to store.
            return;
        }

        JobInfo jobInfo = new JobInfo();
        jobInfo.setJobName(jobName);
        jobInfo.setCompanyName(html.css("div.cn p.cname a", "text").toString());

        // The company address is read from the second "div.bmsg" block.
        List<Selectable> bmsgBlocks = html.css("div.bmsg").nodes();
        jobInfo.setCompanyAddr(bmsgBlocks.size() > 1 ? plainText(bmsgBlocks.get(1)) : null);

        jobInfo.setCompanyInfo(plainText(html.css("div.tmsg")));
        jobInfo.setJobInfo(plainText(html.css("div.job_msg")));
        jobInfo.setJobAddr(html.css("div.cn span.lname", "text").toString());
        jobInfo.setUrl(page.getUrl().toString());

        // TODO: salary (salaryMin / salaryMax) is not extracted yet.
        // TODO: the publication time is still a fixed placeholder; it should be
        // parsed from the page.
        String time = "10-17";
        jobInfo.setTime(time);

        page.putField("jobInfo", jobInfo);
    }

    /**
     * Strips the markup from a selected fragment.
     *
     * @param fragment selection result, possibly matching nothing
     * @return the fragment's text content, or {@code null} if nothing was selected
     */
    private static String plainText(Selectable fragment) {
        String markup = fragment.toString();
        return markup == null ? null : Jsoup.parse(markup).text();
    }

    /**
     * @return the download settings used by this crawler
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Scheduled entry point: runs one complete crawl, starting 1 second after
     * startup and then again 100 seconds after each run finishes.
     *
     * <p>Each run uses a fresh queue scheduler with a Bloom-filter duplicate
     * remover (expected 100000 URLs), 10 threads, and the Spring-managed
     * pipeline for persistence. To route requests through a proxy, configure an
     * {@code HttpClientDownloader} with a {@code SimpleProxyProvider} and pass it
     * to {@code setDownloader(...)}.
     */
    @Scheduled(initialDelay = 1000, fixedDelay = 100 * 1000)
    public void process() {
        // Use this bean as the processor instead of constructing an unmanaged copy per run.
        Spider.create(this)
                .addUrl(url)
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(100000)))
                .thread(10)
                .addPipeline(springDataPipeline)
                .run();
    }
}
8.创建Pipeline:获取封装好的招聘信息对象,并保存到数据库
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
 * WebMagic pipeline that forwards scraped job postings to {@link JobInfoService}.
 */
@Component
public class SpringDataPipeline implements Pipeline {

    @Autowired
    private JobInfoService jobInfoService;

    /**
     * Stores the {@link JobInfo} found under the result key {@code "jobInfo"}, if any.
     *
     * @param resultItems extraction results of one processed page
     * @param task        the crawl task the page belongs to (not used here)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        JobInfo scraped = resultItems.get("jobInfo");

        // Pages that produced no "jobInfo" entry have nothing to persist.
        if (scraped == null) {
            return;
        }

        jobInfoService.save(scraped);
    }
}