Java jsoup 网络爬虫学习例子(四):抓取网页链接并插入 MySQL 数据库
// ===== GetLink.java =====
package com.iteye.injavawetrust.jsoup;

import java.io.IOException;
import java.util.Iterator;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Fetches one web page and stores every link-like element found on it
 * (anchors, images, select options and link tags) in the {@code link} table.
 *
 * @author InJavaWeTrust
 */
public class GetLink {

    private final JsoupUtil ju = JsoupUtil.getInstance();
    private final DBUtil du = DBUtil.getInstance();

    /**
     * Downloads {@code url} with a 5 second timeout and inserts one row per
     * matching element. An {@link IOException} while fetching is printed and
     * the method returns without inserting anything.
     *
     * @param url address of the page to scan
     */
    public void getLink(String url) {
        try {
            Document document = Jsoup.connect(url).timeout(5000).get();

            Elements hrefs = document.select("a[href]");
            for (Element href : hrefs) {
                save(href.text(), href.attr("href"));
            }

            Elements srcs = document.select("img[src]");
            for (Element src : srcs) {
                // Images have no text content; the alt attribute is used as the name.
                save(src.attr("alt"), src.attr("src"));
            }

            Elements opts = document.select("option[value]");
            for (Element opt : opts) {
                save(opt.text(), opt.attr("value"));
            }

            Elements links = document.select("link[href]");
            for (Element li : links) {
                save(li.text(), li.attr("href"));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds a fresh {@link Link} with a new id and writes it to the database.
     * A new object is created per row so that no state is shared between rows.
     */
    private void save(String name, String target) {
        Link link = new Link();
        link.setId(ju.getUUID());
        link.setUrlName(name);
        link.setUrl(target);
        // Parameterized insert: scraped text may contain quotes or other SQL metacharacters.
        du.insertLink(link);
    }

    public static void main(String[] args) {
        new GetLink().getLink(Constants.URL);
    }
}

// ===== Link.java =====
package com.iteye.injavawetrust.jsoup;

import java.io.Serializable;
import java.util.Date;

/**
 * Plain data holder for one row of the {@code link} table.
 *
 * @author InJavaWeTrust
 */
public class Link implements Serializable {

    private static final long serialVersionUID = 1165098694307553167L;

    /** ID */
    private String id;

    /** link name */
    private String urlName;

    /** link url */
    private String url;

    /** insert db date */
    private Date date;

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getUrlName() {
        return urlName;
    }

    public void setUrlName(String urlName) {
        this.urlName = urlName;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public Date getDate() {
        return date;
    }

    public void setDate(Date date) {
        this.date = date;
    }
}

// ===== DBUtil.java =====
package com.iteye.injavawetrust.jsoup;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

/**
 * Singleton JDBC helper. Every operation opens its own connection and closes
 * it again before returning; no connection state is kept between calls.
 *
 * @author InJavaWeTrust
 */
public class DBUtil {

    /** Parameterized form of the statement built by {@code JsoupUtil.getInsertSql}. */
    private static final String INSERT_LINK_SQL =
            "insert into link (id, urlname, url, date) values (?, ?, ?, NOW())";

    private static final DBUtil instance = new DBUtil();

    private DBUtil() {
    }

    public static DBUtil getInstance() {
        return instance;
    }

    /**
     * Opens a new database connection using the settings in {@link Constants}.
     *
     * @return a new connection, or {@code null} if it could not be opened
     *         (the failure is printed to standard error)
     */
    public Connection connection() {
        try {
            Class.forName(Constants.DRIVER);
        } catch (ClassNotFoundException e1) {
            e1.printStackTrace();
        }
        try {
            return DriverManager.getConnection(Constants.DBURL, Constants.USER, Constants.PASSWORD);
        } catch (SQLException e) {
            e.printStackTrace();
            // Never hand back a stale connection from an earlier call.
            return null;
        }
    }

    /**
     * Closes the given resources in result set, statement, connection order.
     * Each argument may be {@code null}. A failure to close one resource is
     * printed and does not prevent the remaining ones from being closed.
     *
     * @param rs   result set to close, or {@code null}
     * @param st   statement to close, or {@code null}
     * @param conn connection to close, or {@code null}
     */
    public void release(ResultSet rs, Statement st, Connection conn) {
        if (null != rs) {
            try {
                rs.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
        if (null != st) {
            try {
                st.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
        if (null != conn) {
            try {
                conn.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Executes a complete SQL statement. The caller is responsible for the
     * safety of {@code sql}; prefer {@link #insertLink(Link)} for link rows.
     * SQL errors are printed; the connection is always closed.
     *
     * @param sql statement to execute
     */
    public void insert(String sql) {
        Connection conn = connection();
        if (null == conn) {
            return; // connection() has already reported the failure
        }
        Statement st = null;
        try {
            st = conn.createStatement();
            st.execute(sql);
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            // Runs on failure too, so a bad row no longer leaks the connection.
            release(null, st, conn);
        }
    }

    /**
     * Inserts one link row using bound parameters, so the link name and URL
     * are stored verbatim regardless of the characters they contain. The
     * {@code date} column is filled by the database with {@code NOW()}.
     * SQL errors are printed; the connection is always closed.
     *
     * @param link row to insert
     */
    public void insertLink(Link link) {
        Connection conn = connection();
        if (null == conn) {
            return; // connection() has already reported the failure
        }
        PreparedStatement ps = null;
        try {
            ps = conn.prepareStatement(INSERT_LINK_SQL);
            ps.setString(1, link.getId());
            ps.setString(2, link.getUrlName());
            ps.setString(3, link.getUrl());
            ps.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            release(null, ps, conn);
        }
    }
}

// ===== Constants.java =====
package com.iteye.injavawetrust.jsoup;

/**
 * Connection settings and the demo URL.
 *
 * @author InJavaWeTrust
 */
public class Constants {

    /** MySQL JDBC driver class */
    public static final String DRIVER = "com.mysql.jdbc.Driver";

    /** JDBC connection URL */
    public static final String DBURL = "jdbc:mysql://localhost:3306/jsoupdb?useUnicode=true&characterEncoding=utf-8";

    /** username */
    public static final String USER = "root";

    /** password */
    public static final String PASSWORD = "root";

    /** An arbitrary URL picked for the demo */
    public static final String URL = "http://www.hrbhuade.net/html/main/index.htm";
}

// ===== JsoupUtil.java =====
package com.iteye.injavawetrust.jsoup;

import java.util.UUID;

/**
 * Singleton with small helpers used by the crawler.
 *
 * @author InJavaWeTrust
 */
public class JsoupUtil {

    private static final JsoupUtil instance = new JsoupUtil();

    private JsoupUtil() {
    }

    public static JsoupUtil getInstance() {
        return instance;
    }

    /**
     * Generates a random UUID with the four dashes removed.
     *
     * @return 32-character UUID string
     */
    public String getUUID() {
        return UUID.randomUUID().toString().replace("-", "");
    }

    /**
     * Builds an insert statement for the {@code link} table as a single SQL
     * string. Backslashes and single quotes in the values are escaped so they
     * cannot terminate the string literal; a {@code null} value is rendered as
     * the text {@code null}. Prefer {@code DBUtil.insertLink(Link)}, which
     * binds the values as parameters instead of embedding them.
     *
     * @param link Link obj
     * @return sql
     */
    public String getInsertSql(Link link) {
        return "insert into link (id, urlname, url, date) values ('"
                + escape(link.getId()) + "','"
                + escape(link.getUrlName()) + "','"
                + escape(link.getUrl()) + "',NOW())";
    }

    /** Escapes a value for use inside a single-quoted MySQL string literal. */
    private static String escape(String value) {
        // Backslash first, so the quote replacement does not get re-escaped.
        return String.valueOf(value).replace("\\", "\\\\").replace("'", "''");
    }
}
link 表
-- Recreate the `link` table written to by the crawler (one row per scraped element).
DROP TABLE IF EXISTS `link`;

CREATE TABLE `link` (
  `id` varchar(32) NOT NULL,           -- 32-character id (UUID without dashes)
  `urlname` varchar(200) DEFAULT NULL, -- link name
  `url` varchar(200) DEFAULT NULL,     -- link url
  `date` datetime DEFAULT NULL,        -- insert time
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
发布者:全栈程序员-站长,转载请注明出处:https://javaforall.net/226244.html 原文链接:https://javaforall.net
