- 目录
WebMagic 官网 http://webmagic.io/
根据官方提供的案例 GithubRepoPageProcessor 编写测试(由于网络认证的原因,该案例可能无法直接抓取到数据,不影响使用)。只要程序能够正常启动并运行,就说明项目配置没有问题。
抓取http://lianhanghao.com/index.php/Index/index/p/1.html 所有的联行号
-
WebMagic 官网 http://webmagic.io/
-
导入依赖
-
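<dependencies>
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-core</artifactId>
        <version>0.7.3</version>
    </dependency>
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-extension</artifactId>
        <version>0.7.3</version>
    </dependency>
    <dependency>
        <groupId>com.fasterxml.jackson.core</groupId>
        <artifactId>jackson-databind</artifactId>
        <version>2.9.4</version>
        <scope>compile</scope>
    </dependency>
    <dependency>
        <groupId>commons-logging</groupId>
        <artifactId>commons-logging</artifactId>
        <version>1.2</version>
    </dependency>
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <optional>true</optional>
    </dependency>
    <dependency>
        <groupId>com.danrenying</groupId>
        <artifactId>base</artifactId>
        <version>1.0-SNAPSHOT</version>
        <scope>compile</scope>
    </dependency>
</dependencies>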
-
根据官方提供的案例 GithubRepoPageProcessor 编写测试(由于网络认证的原因,该案例可能无法直接抓取到数据,不影响使用)。只要程序能够正常启动并运行,就说明项目配置没有问题。
-
public class GithubRepoPageProcessor implements PageProcessor { //配置信息 重试的次数 休眠的时间 超时的时间 private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000); @Override //处理的逻辑 public void process(Page page) { //通过page对象获取html的代码,然后从html中获取需要的数据 //添加addTargetRequests中,执行完成之后会继续执行.执行完当前的页面在获取新页面 page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all()); page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-])").all()); //抓取的数据使用putField 通过key value的方式存储 page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString()); //跳过 if (page.getResultItems().get("name")==null){ //skip this page page.setSkip(true); } page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()")); } @Override public Site getSite() { return site; } public static void main(String[] args) { //通过create启动爬虫, 添加入口的url. 设置线程数.--> 访问之后就会拿到相应中的html 然后就回去执行process 方法. Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run(); }} 抓取http://lianhanghao.com/index.php/Index/index/p/1.html 所有的联行号
package com.danrenying.web.sendtips.base.utils; import java.util.regex.Pattern; / * @program: 校验工具类 * @description: * @author: 单人影 * @create: 2020/11/28 / public class ValiateUtils { / * 判断是否为数字格式不限制位数 * * @param o * 待校验参数 * @return 如果全为数字,返回true;否则,返回false */ private static final Pattern numberPpattern = Pattern.compile("[0-9]*"); public static boolean isNumber(Object content) { return numberPpattern.matcher(String.valueOf(content)).matches(); } }
使用的工具类
获取页面数据代码
package com.danrenying.web.webmagic; import com.danrenying.web.sendtips.base.utils.ValiateUtils; import lombok.extern.slf4j.Slf4j; import org.jsoup.Jsoup; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; @Slf4j public class BankPageProcessor implements PageProcessor { private Site site = Site.me().setRetryTimes(5).setSleepTime(1000).setTimeOut(30000); private static String BODY = "body"; @Override public void process(Page page) { //得到当前页面数据 /* nodename 选取此节点的所有子节点 / 从当前节点选区直接子节点 // 从当前节点选取子孙节点 . 选取当前节点 .. 选取当前节点的父节点 @ 选取属性 */ Listhtmls = page.getHtml().xpath("//tbody/tr").all(); for (String html1 : htmls) { String title = seletDocumentText(html1, BODY); String[] split = title.split(" "); if (split.length != 2) { continue; } if (!ValiateUtils.isNumber(split[0])) { continue; } page.putField(split[0], split[1]); } //循环访问剩下的页面 for (int i = 1; i <= 12615; i++) { page.addTargetRequest("http://lianhanghao.com/index.php/Index/index/p/" + i + ".html"); } } @Override public Site getSite() { return site; } public static void main(String[] args) { bank(); } public static void bank() { Spider.create(new BankPageProcessor()) .addUrl("http://lianhanghao.com/index.php/Index/index/p/1.html") .addPipeline(new MyPipeline()) .thread(1).run(); } / * jsoup根据 html 字符串和语法获取内容; * * @param htmlText * @return * @date 2017年8月31日 */ private static String seletDocumentText(String htmlText, String Query) { String select = Jsoup.parse(htmlText).select(Query).text(); return select; } } -
处理类:
-
package com.danrenying.web.webmagic; import lombok.extern.slf4j.Slf4j; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.pipeline.Pipeline; import java.sql.Connection; import java.sql.PreparedStatement; import java.sql.SQLException; import java.util.Map; @Slf4j public class MyPipeline implements Pipeline { private static Connection connection = null; //初始链接 private static Connection getConnection() { if (connection == null) { connection = MySqlJdbcUtils.getOpenConnection(); } return connection; } @Override public void process(ResultItems resultItems, Task task) { // INSERT INTO basic_openbankinfo( OPENBANKNAME, OPENBANK, BANKNAME, CITYNAME, PROVINCENAME, BANKCODE, CITYCODE, PROVINCECODE, MODIFYTIME, COUNTYCODE, COUNTYNAME) VALUES ( '中国建设银行股份有限公司咸阳分行营业部', '7', '中国建设银行', '咸阳市', '陕西', '005', '6104', '61', NULL, NULL, NULL); String sql = ""; Mapmaps = resultItems.getAll(); for (String key : maps.keySet()) { maps.get(key); sql = " INSERT INTO basic_openbankinfo_bak1( OPENBANKNAME, OPENBANK, BANKNAME, CITYNAME, PROVINCENAME, BANKCODE, CITYCODE, PROVINCECODE, MODIFYTIME, COUNTYCODE, COUNTYNAME) VALUES ( '" + maps.get(key) + "','" + key + "', NULL, NULL, NULL, NULL, NULL,NULL, NULL, NULL, NULL)"; try { Connection connection = getConnection(); PreparedStatement preparedStatement = connection.prepareStatement(sql); int i = preparedStatement.executeUpdate(); log.info("执行SQL语句:{},返回影响结果条数:{}", sql, i); System.out.println("执行SQL语句" + sql + ",返回影响结果条数" + i); } catch (SQLException e) { e.printStackTrace(); } } } }
数据库连接工具:
package com.danrenying.web.webmagic;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;

/**
 * Opens JDBC connections to the MySQL database used by the crawler.
 */
public class MySqlJdbcUtils {

    private static final String DRIVER = "com.mysql.jdbc.Driver";
    private static final String URL =
            "jdbc:mysql://xxx.xx.xx.xxx:3306/data?useUnicode=true&characterEncoding=utf-8";
    private static final String USER = "root";
    private static final String PASSWORD = "xxxx";

    /**
     * Loads the driver and opens a new connection.
     *
     * <p>Failures are reported on the error stream and signalled by a {@code null}
     * return value rather than an exception, so callers must check the result.
     *
     * @return a new connection, or {@code null} if the driver class is missing or
     *         the connection cannot be established
     */
    public static Connection getOpenConnection() {
        Connection conn = null;
        try {
            // Load the JDBC driver class before asking DriverManager for a connection.
            Class.forName(DRIVER);
            conn = DriverManager.getConnection(URL, USER, PASSWORD);
            System.out.println("获得数据库链接");
        } catch (ClassNotFoundException | SQLException e) {
            e.printStackTrace();
        }
        return conn;
    }

    /** Manual check: tries to open a connection once. */
    public static void main(String[] args) {
        getOpenConnection();
    }
}
爬取上海链家的租房信息
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import java.util.List;
public class LianjiaPageProcessor implements PageProcessor {
private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);
@Override
public void process(Page page) {
Html html = page.getHtml();
// 房源详情连接 使用样式选择器-->.content__list--item--title 下面的 a标签
List
list = html.css(".content__list--item--title a").links().all(); page.addTargetRequests(list); //获取标题 主要区别列表页和详情页的标题属性. String title = html.xpath("//div[@class='content clear w1150']/p/text()").toString(); page.putField("title", title); page.putField("rent", html.xpath("//p[@class='content__aside--title']/span/text()").toString()); page.putField("type", html.xpath("//p[@class='content__article__table']/allText()").toString()); page.putField("info", html.xpath("//div[@class='content__article__info']/allText()").toString()); page.putField("img", html.xpath("//div[@class='content__article__slide__item']/img").toString()); if(page.getResultItems().get("title") == null){ page.setSkip(true); // 分页连接 //框架内部会自动去重.抓取过的就不会重新抓取了. for (int i = 1; i <= 100; i++) { page.addTargetRequest("https://sh.lianjia.com/zufang/pg"+i); } } } @Override public Site getSite() { return site; } public static void main(String[] args) { Spider.create(new LianjiaPageProcessor()) .addUrl("https://sh.lianjia.com/zufang/") .addPipeline(new MyPipeline()) .thread(1).run(); } }
- 页面样式
- 分页:
- 翻页时直接修改 URL 中 pg 后面的数字即可。
- 这样WebMagic抓取数据就完成了. 获取到的数据 看你自己怎么处理.
- 处理数据需要实现 Pipeline 接口;爬取到的数据保存在 resultItems 中,被跳过(setSkip(true))的页面不会进入 Pipeline。
-
import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.HttpClientBuilder; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.pipeline.Pipeline; import java.io.File; import java.util.HashMap; import java.util.Map; public class MyPipeline implements Pipeline { private static final ObjectMapper MAPPER = new ObjectMapper(); @Override public void process(ResultItems resultItems, Task task) { Mapdata = new HashMap<>(); data.put("url", resultItems.getRequest().getUrl()); data.put("title", resultItems.get("title"));//标题 data.put("rent", resultItems.get("rent"));//租金 String[] types = StringUtils.split(resultItems.get("type"), ' '); data.put("rentMethod", types[0]);//租赁方式 data.put("houseType", types[1]);//户型,如:2室1厅1卫 data.put("orientation", types[2]);//朝向 String[] infos = StringUtils.split(resultItems.get("info"), ' '); for (String info : infos) { if (StringUtils.startsWith(info, "看房:")) { data.put("time", StringUtils.split(info, ':')[1]); } else if (StringUtils.startsWith(info, "楼层:")) { data.put("floor", StringUtils.split(info, ':')[1]); } } //下载房源图片到本地 String imageUrl = StringUtils.split(resultItems.get("img"), '"')[3]; String newName = StringUtils .substringBefore(StringUtils .substringAfterLast(resultItems.getRequest().getUrl(), "/"), ".") + ".jpg"; try { this.downloadFile(imageUrl, new File("F:\\code\\images\\" + newName)); data.put("image", newName); String json = MAPPER.writeValueAsString(data); FileUtils.write(new File("F:\\code\\data.json"), json + "\n", "UTF-8", true); } catch (Exception e) { e.printStackTrace(); } } / * 下载文件 * * @param url 文件url * @param dest 目标目录 * @throws Exception */ public void downloadFile (String url, File dest) throws Exception 
{ HttpGet httpGet = new HttpGet(url); CloseableHttpResponse response = HttpClientBuilder.create().build().execute(httpGet); try { FileUtils.writeByteArrayToFile(dest, IOUtils.toByteArray(response.getEntity().getContent())); } finally { response.close(); } } }
发布者:全栈程序员-站长,转载请注明出处:https://javaforall.net/214836.html原文链接:https://javaforall.net
