java网页数据抓取实例
网页上面数据如下:
如果想要获取上图所示网页的数据,代码如下:
(1)调度类,主要调用工具类中的方法获取数据并入库
package com.jointsky.jointframe.weather.jobservice;import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.springframework.transaction.annotation.Transactional;
import com.jointsky.jointframe.scheduler.exception.ExecutionException;
import com.jointsky.jointframe.scheduler.quartz.JobService;
import com.jointsky.jointframe.weather.entity.ActuallyForecastWeather;
import com.jointsky.jointframe.weather.entity.ActuallyWeather;
import com.jointsky.jointframe.weather.service.ActuallyForecastWeatherManager;
import com.jointsky.jointframe.weather.service.ActuallyWeatherManager;
import com.jointsky.jointframe.weather.utils.UrlInfo;
/**
 * Scheduled job that downloads the live (same-day) and forecast weather data
 * for every configured county and stores it.
 *
 * <p>For each county code from {@link UrlInfo#getCountyCodes()} the job fetches
 * both record lists, tags every record with its city/county names and then
 * updates the stored row that has the same forecast time and place name, or
 * inserts a new row when none is found.</p>
 */
@Transactional
public class ActuallyWeatherJobService implements JobService {

    /** Manager used to look up, update and save live (same-day) weather records. */
    private ActuallyWeatherManager actuallyWeatherManager;

    /**
     * Live weather entity bean property. Kept for configuration compatibility;
     * {@link #execute(Map)} works on local variables and does not touch it.
     */
    private ActuallyWeather actuallyWeather;

    /** Manager used to look up, update and save forecast weather records. */
    private ActuallyForecastWeatherManager actuallyForecastWeatherManager;

    /**
     * Forecast weather entity bean property. Kept for configuration compatibility;
     * {@link #execute(Map)} works on local variables and does not touch it.
     */
    private ActuallyForecastWeather actuallyForecastWeather;

    /**
     * Parameter map bean property. Kept for configuration compatibility;
     * lookups build their own fresh map so no keys can leak between records.
     */
    private Map<String, Object> map = new HashMap<String, Object>();

    /**
     * Runs the job once for every configured county code.
     *
     * <p>A failure while processing one county is printed and does not stop the
     * remaining counties from being processed (best effort).</p>
     *
     * @param arg0 scheduler arguments; not used by this job
     * @throws ExecutionException declared by the {@link JobService} contract
     */
    @Override
    public void execute(Map<String, Object> arg0) throws ExecutionException {
        System.out.println("实况天气资料数据获取调度成功");
        String[] countyCodes = UrlInfo.getCountyCodes().split(",");
        for (String countyCode : countyCodes) {
            if (StringUtils.isEmpty(countyCode)) {
                continue;
            }
            try {
                processCounty(countyCode);
            } catch (Exception e) {
                // Deliberately best effort: one failing county must not abort the others.
                e.printStackTrace();
            }
        }
    }

    /**
     * Fetches and stores the live and forecast records of a single county.
     *
     * @param countyCode non-empty county code
     * @throws Exception if fetching, parsing or persisting fails
     */
    private void processCounty(String countyCode) throws Exception {
        String url = UrlInfo.getUrl(countyCode);
        Map<String, Object> district = UrlInfo.getDistrict(countyCode);
        // City-level name
        String cityLevel = (String) district.get("cityLevel");
        // County-level name
        String countyLevel = (String) district.get("countyLevel");

        // Both lists are fetched before anything is written, as before.
        List<ActuallyWeather> actualRecords = UrlInfo.getURLInfoOfActully(url, "utf-8");
        List<ActuallyForecastWeather> forecastRecords = UrlInfo.getURLInfoOfForecast(url, "utf-8");

        for (ActuallyWeather record : actualRecords) {
            record.setCityLevel(cityLevel);
            record.setCountyLevel(countyLevel);
            saveOrUpdateActual(record);
        }
        for (ActuallyForecastWeather record : forecastRecords) {
            record.setCityLevel(cityLevel);
            record.setCountyLevel(countyLevel);
            saveOrUpdateForecast(record);
        }
    }

    /** Updates the stored live record with the same forecast time and place name, or inserts a new one. */
    private void saveOrUpdateActual(ActuallyWeather record) {
        Map<String, Object> params = buildLookupParams(record.getForecastTime(), record.getPlaceName());
        String existingId = actuallyWeatherManager.findIdByParams(params);
        if (StringUtils.isNotEmpty(existingId)) {
            record.setId(existingId);
            actuallyWeatherManager.updateWeather(record);
        } else {
            actuallyWeatherManager.save(record);
        }
    }

    /** Updates the stored forecast record with the same forecast time and place name, or inserts a new one. */
    private void saveOrUpdateForecast(ActuallyForecastWeather record) {
        Map<String, Object> params = buildLookupParams(record.getForecastTime(), record.getPlaceName());
        String existingId = actuallyForecastWeatherManager.findIdByParams(params);
        if (StringUtils.isNotEmpty(existingId)) {
            record.setId(existingId);
            actuallyForecastWeatherManager.updateForecastWeather(record);
        } else {
            actuallyForecastWeatherManager.save(record);
        }
    }

    /**
     * Builds a fresh lookup map holding only the non-empty criteria.
     *
     * <p>A new map per lookup guarantees that keys from an earlier record (for
     * example one whose lookup threw) can never influence a later lookup.</p>
     *
     * @param forecastTime forecast time of the record, may be empty
     * @param placeName place name of the record, may be empty
     * @return map with the keys {@code forecastTime} and/or {@code placeName}
     */
    private static Map<String, Object> buildLookupParams(String forecastTime, String placeName) {
        Map<String, Object> params = new HashMap<String, Object>();
        // Forecast time
        if (StringUtils.isNotEmpty(forecastTime)) {
            params.put("forecastTime", forecastTime);
        }
        // Place name
        if (StringUtils.isNotEmpty(placeName)) {
            params.put("placeName", placeName);
        }
        return params;
    }

    public ActuallyWeatherManager getActuallyWeatherManager() {
        return actuallyWeatherManager;
    }

    public void setActuallyWeatherManager(
            ActuallyWeatherManager actuallyWeatherManager) {
        this.actuallyWeatherManager = actuallyWeatherManager;
    }

    public ActuallyWeather getActuallyWeather() {
        return actuallyWeather;
    }

    public void setActuallyWeather(ActuallyWeather actuallyWeather) {
        this.actuallyWeather = actuallyWeather;
    }

    public Map<String, Object> getMap() {
        return map;
    }

    public void setMap(Map<String, Object> map) {
        this.map = map;
    }

    public ActuallyForecastWeatherManager getActuallyForecastWeatherManager() {
        return actuallyForecastWeatherManager;
    }

    public void setActuallyForecastWeatherManager(
            ActuallyForecastWeatherManager actuallyForecastWeatherManager) {
        this.actuallyForecastWeatherManager = actuallyForecastWeatherManager;
    }

    public ActuallyForecastWeather getActuallyForecastWeather() {
        return actuallyForecastWeather;
    }

    public void setActuallyForecastWeather(
            ActuallyForecastWeather actuallyForecastWeather) {
        this.actuallyForecastWeather = actuallyForecastWeather;
    }
}
View Code
(2)工具类,主要为一些执行查询数据的实现方法
package com.jointsky.jointframe.weather.utils;import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import com.jointsky.jointframe.weather.entity.ActuallyForecastWeather;
import com.jointsky.jointframe.weather.entity.ActuallyWeather;
/**
 * Utility class that downloads a county weather page and extracts live
 * (same-day) and forecast weather records from its HTML with regular
 * expressions.
 */
public class UrlInfo {

    /** Matches the text inside a {@code <font>} element; DOTALL lets it span line breaks. */
    private static Pattern proInfo = Pattern.compile("<font>(.*?)</font>", Pattern.DOTALL);

    /** Matches opening and closing anchor tags so links can be stripped. */
    private static final Pattern ANCHOR_TAG = Pattern.compile("</?a[^>]*>");

    /** Matches an opening tag including attributes so it can be reduced to the bare tag. */
    private static final Pattern TAG_WITH_ATTRIBUTES = Pattern.compile("<(\\w+)[^>]*>");

    /** Charset used when the caller supplies none. */
    private static final String DEFAULT_CHARSET = "utf-8";

    /** Header text of the last table column; the chunk after it holds the table cells. */
    private static final String WIND_HEADER = "风力";

    /** Number of cells per table row: place name, weather status, temperature, wind power. */
    private static final int CELLS_PER_ROW = 4;

    /** One day in milliseconds, as a long so day offsets cannot overflow int arithmetic. */
    private static final long MILLIS_PER_DAY = 24L * 60L * 60L * 1000L;

    /** Connect/read timeout so a stalled server cannot block the caller forever. */
    private static final int TIMEOUT_MILLIS = 30000;

    /**
     * Ningxia county codes (190 townships in total).
     * Yinchuan: municipal districts (11 townships) 53614; Helan County (7) 53610;
     * Yongning County (8) 53618; Lingwu City (8) 53619.
     */
    private static String countyCodes = "53614,53610,53618,53619";

    /**
     * Downloads the page and returns the live (same-day) weather records.
     *
     * @param urlInfo page URL
     * @param charset charset used to decode the page; {@code utf-8} when null or empty
     * @return parsed same-day records, possibly empty
     * @throws Exception if the page cannot be downloaded or decoded
     */
    public static List<ActuallyWeather> getURLInfoOfActully(String urlInfo, String charset) throws Exception {
        String html = getUrlInfo(urlInfo, charset);
        return parseActual(html, new Date());
    }

    /**
     * Downloads the page and returns the forecast (following days) weather records.
     *
     * @param urlInfo page URL
     * @param charset charset used to decode the page; {@code utf-8} when null or empty
     * @return parsed forecast records, possibly empty
     * @throws Exception if the page cannot be downloaded or decoded
     */
    public static List<ActuallyForecastWeather> getURLInfoOfForecast(String urlInfo, String charset) throws Exception {
        String html = getUrlInfo(urlInfo, charset);
        return parseForecast(html, new Date());
    }

    /**
     * Downloads a page as UTF-8 and returns its simplified source.
     *
     * @param urlInfo page URL
     * @return page source with anchors removed, tag attributes stripped and lines joined
     * @throws Exception if the page cannot be downloaded
     */
    public static String getUrlInfo(String urlInfo) throws Exception {
        return getUrlInfo(urlInfo, DEFAULT_CHARSET);
    }

    /**
     * Downloads a page with the given charset and simplifies its markup.
     *
     * <p>Lines are concatenated without a separator, anchor tags are removed and
     * every opening tag is reduced to its bare name (e.g. {@code <font color=..>}
     * becomes {@code <font>}) so the extraction patterns can match.</p>
     */
    private static String getUrlInfo(String urlInfo, String charset) throws Exception {
        String effectiveCharset = StringUtils.isNotEmpty(charset) ? charset : DEFAULT_CHARSET;
        URL url = new URL(urlInfo);
        HttpURLConnection httpUrl = (HttpURLConnection) url.openConnection();
        httpUrl.setConnectTimeout(TIMEOUT_MILLIS);
        httpUrl.setReadTimeout(TIMEOUT_MILLIS);

        InputStream is = null;
        BufferedReader br = null;
        try {
            is = httpUrl.getInputStream();
            br = new BufferedReader(new InputStreamReader(is, effectiveCharset));
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                // Strip links
                line = ANCHOR_TAG.matcher(line).replaceAll("");
                // Strip styling attributes
                line = TAG_WITH_ATTRIBUTES.matcher(line).replaceAll("<$1>");
                sb.append(line);
            }
            return sb.toString().trim();
        } finally {
            // Close even when reading fails; closing the reader also closes the stream.
            if (br != null) {
                br.close();
            } else if (is != null) {
                is.close();
            }
        }
    }

    /**
     * Builds the same-day records from the first weather tables of the page.
     *
     * <p>The number of tables that still belong to today depends on the current
     * hour, see {@link #remainingTodaySlots(Date)}.</p>
     */
    private static List<ActuallyWeather> parseActual(String html, Date now) {
        List<ActuallyWeather> result = new ArrayList<ActuallyWeather>();
        SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
        String pubTime = dayFormat.format(now);
        int todaySlots = remainingTodaySlots(now);
        List<String> tables = weatherTables(html);

        int limit = Math.min(todaySlots, tables.size());
        for (int index = 0; index < limit; index++) {
            // Number of today's tables that come after this one (0 = last slot of the day).
            int slotsAfter = todaySlots - (index + 1);
            String forecastTime = todayForecastTime(dayFormat, now, slotsAfter);
            for (String[] row : parseRows(tables.get(index))) {
                ActuallyWeather weather = new ActuallyWeather();
                if (row[0] != null) {
                    weather.setPlaceName(row[0]);
                }
                if (row[1] != null) {
                    weather.setWeatherStatus(row[1]);
                }
                if (row[2] != null) {
                    String[] temps = row[2].split("/");
                    if (temps.length > 0) {
                        weather.setMaxTemperature(temps[0] + "℃");
                    }
                    if (temps.length > 1) {
                        weather.setMinTemperature(temps[1]);
                    }
                }
                if (row[3] != null) {
                    weather.setWindPower(row[3]);
                }
                // Publication date
                weather.setPubTime(pubTime);
                weather.setForecastTime(forecastTime);
                result.add(weather);
            }
        }
        return result;
    }

    /**
     * Builds the forecast records from the weather tables that follow today's tables.
     *
     * <p>The first forecast table is dated tomorrow 00, each further table one day later.</p>
     */
    private static List<ActuallyForecastWeather> parseForecast(String html, Date now) {
        List<ActuallyForecastWeather> result = new ArrayList<ActuallyForecastWeather>();
        SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
        String pubTime = dayFormat.format(now);
        int todaySlots = remainingTodaySlots(now);
        List<String> tables = weatherTables(html);

        for (int index = todaySlots; index < tables.size(); index++) {
            int dayOffset = index + 1 - todaySlots;
            // long arithmetic: an int product would overflow from 25 days on
            Date day = new Date(now.getTime() + dayOffset * MILLIS_PER_DAY);
            String forecastTime = dayFormat.format(day) + " 00";
            for (String[] row : parseRows(tables.get(index))) {
                ActuallyForecastWeather forecast = new ActuallyForecastWeather();
                if (row[0] != null) {
                    forecast.setPlaceName(row[0]);
                }
                if (row[1] != null) {
                    forecast.setWeatherStatus(row[1]);
                }
                if (row[2] != null) {
                    String[] temps = row[2].split("/");
                    if (temps.length > 0) {
                        forecast.setMaxTemperature(temps[0] + "℃");
                    }
                    if (temps.length > 1) {
                        forecast.setMinTemperature(temps[1]);
                    }
                }
                if (row[3] != null) {
                    forecast.setWindPower(row[3]);
                }
                // Publication date
                forecast.setPubTime(pubTime);
                forecast.setForecastTime(forecastTime);
                result.add(forecast);
            }
        }
        return result;
    }

    /**
     * Returns how many of the page's leading tables still describe today.
     *
     * <p>Up to and including hour 12 there are four slots (06-12, 12-18, 18-24,
     * 00-06 next day); after that and up to hour 18 three; later two.</p>
     */
    private static int remainingTodaySlots(Date now) {
        int hour = Integer.parseInt(new SimpleDateFormat("HH").format(now));
        if (hour <= 12) {
            return 4;
        }
        if (hour <= 18) {
            return 3;
        }
        return 2;
    }

    /**
     * Formats the forecast time of one of today's slots.
     *
     * @param slotsAfter number of today's slots following this one, 0..3;
     *        3 maps to hour 06, 2 to 12, 1 to 18 and 0 to 00 of the next day
     */
    private static String todayForecastTime(SimpleDateFormat dayFormat, Date now, int slotsAfter) {
        int hour = 24 - 6 * slotsAfter;
        if (hour == 24) {
            // Early morning of the next day
            Date tomorrow = new Date(now.getTime() + MILLIS_PER_DAY);
            return dayFormat.format(tomorrow) + " 00";
        }
        return dayFormat.format(now) + (hour < 10 ? " 0" : " ") + hour;
    }

    /**
     * Splits the page at {@code <th>} and keeps, in page order, the chunks whose
     * first {@code <font>} text is the wind-power header; each such chunk holds
     * the cells of one weather table.
     */
    private static List<String> weatherTables(String html) {
        List<String> tables = new ArrayList<String>();
        for (String chunk : html.split("<th>")) {
            if (WIND_HEADER.equals(firstFontToken(chunk))) {
                tables.add(chunk);
            }
        }
        return tables;
    }

    /**
     * Splits a table chunk at {@code <td>} and groups the cells into rows of
     * four: place name, weather status, temperature, wind power.
     *
     * <p>The part before the first {@code <td>} is the header and is skipped. A
     * cell without {@code <font>} text leaves its entry {@code null}; a trailing
     * incomplete row is dropped.</p>
     */
    private static List<String[]> parseRows(String tableChunk) {
        List<String[]> rows = new ArrayList<String[]>();
        String[] cells = tableChunk.split("<td>");
        String[] row = new String[CELLS_PER_ROW];
        for (int i = 1; i < cells.length; i++) {
            int column = (i - 1) % CELLS_PER_ROW;
            String value = firstFontToken(cells[i]);
            if (value != null) {
                row[column] = value;
            }
            if (column == CELLS_PER_ROW - 1) {
                rows.add(row);
                row = new String[CELLS_PER_ROW];
            }
        }
        return rows;
    }

    /**
     * Returns the first {@code <font>} text of a fragment, trimmed, with spaces
     * removed and cut at the first {@code >}.
     *
     * @return the token, an empty string when the text has no token, or
     *         {@code null} when the fragment contains no {@code <font>} element
     */
    private static String firstFontToken(String fragment) {
        Matcher matcher = proInfo.matcher(fragment);
        if (!matcher.find()) {
            return null;
        }
        String[] parts = matcher.group(1).trim().replace(" ", "").split(">");
        // split drops trailing empties, so a text like ">" yields no parts at all
        return parts.length > 0 ? parts[0] : "";
    }

    /**
     * Builds the page URL for a county.
     *
     * @param countyCode county code; when empty the base URL is returned
     * @return page URL
     * @throws Exception never thrown by this implementation; kept for callers
     */
    public static String getUrl(String countyCode) throws Exception {
        String url = "http://3g.nx121.com/pc/tqybxzb.aspx";
        if (StringUtils.isNotEmpty(countyCode)) {
            url = url + "?sd=" + countyCode;
        }
        return url;
    }

    /**
     * Looks up the city-level and county-level district names of a county code.
     *
     * @param countyCode county code
     * @return map with the keys {@code cityLevel} and {@code countyLevel};
     *         empty when the code is unknown
     * @throws Exception never thrown by this implementation; kept for callers
     */
    public static Map<String, Object> getDistrict(String countyCode) throws Exception {
        Map<String, Object> map = new HashMap<String, Object>();
        String countyLevel = null;
        if ("53614".equals(countyCode)) {
            countyLevel = "市辖区";
        } else if ("53610".equals(countyCode)) {
            countyLevel = "贺兰县";
        } else if ("53618".equals(countyCode)) {
            countyLevel = "永宁县";
        } else if ("53619".equals(countyCode)) {
            countyLevel = "灵武市";
        }
        if (countyLevel != null) {
            // City-level district; all known codes belong to the same city
            map.put("cityLevel", "银川市");
            // County-level district
            map.put("countyLevel", countyLevel);
        }
        return map;
    }

    public static Pattern getProInfo() {
        return proInfo;
    }

    public static void setProInfo(Pattern proInfo) {
        UrlInfo.proInfo = proInfo;
    }

    public static String getCountyCodes() {
        return countyCodes;
    }

    public static void setCountyCodes(String countyCodes) {
        UrlInfo.countyCodes = countyCodes;
    }
}
View Code
(3)实体类,用于存放数据的bean
package com.jointsky.jointframe.weather.entity;import javax.persistence.Column;
import javax.persistence.Entity;
import javax.persistence.Table;
import org.apache.commons.lang.builder.EqualsBuilder;
import org.apache.commons.lang.builder.HashCodeBuilder;
import org.apache.commons.lang.builder.ToStringBuilder;
import org.apache.commons.lang.builder.ToStringStyle;
import org.hibernate.annotations.Cache;
import org.hibernate.annotations.CacheConcurrencyStrategy;
import com.jointsky.jointframe.core.entity.IdEntity;
/**
 * Entity for live (same-day) weather records, mapped to the table
 * {@code t_actually_weather}. All column mappings are declared on the getters.
 */
@Entity
@Table(name = "t_actually_weather")
@Cache(usage = CacheConcurrencyStrategy.READ_WRITE)
public class ActuallyWeather extends IdEntity {

    private static final long serialVersionUID = -5324072662712469478L;

    /** City-level district name. */
    private String cityLevel;

    /** County-level district name. */
    private String countyLevel;

    /** Publication time. */
    private String pubTime;

    /** Forecast time. */
    private String forecastTime;

    /** Place name. */
    private String placeName;

    /** Weather status, e.g. cloudy, sunny, light rain. */
    private String weatherStatus;

    /** Highest temperature. */
    private String maxTemperature;

    /** Lowest temperature. */
    private String minTemperature;

    /** Wind power. */
    private String windPower;

    @Column(name="city_level",length=50)
    public String getCityLevel() {
        return cityLevel;
    }

    public void setCityLevel(String cityLevel) {
        this.cityLevel = cityLevel;
    }

    @Column(name="county_level",length=50)
    public String getCountyLevel() {
        return countyLevel;
    }

    public void setCountyLevel(String countyLevel) {
        this.countyLevel = countyLevel;
    }

    @Column(name="pub_time",length=50)
    public String getPubTime() {
        return pubTime;
    }

    public void setPubTime(String pubTime) {
        this.pubTime = pubTime;
    }

    @Column(name="forecast_time",length=50)
    public String getForecastTime() {
        return forecastTime;
    }

    public void setForecastTime(String forecastTime) {
        this.forecastTime = forecastTime;
    }

    @Column(name="place_name",length=50)
    public String getPlaceName() {
        return placeName;
    }

    public void setPlaceName(String placeName) {
        this.placeName = placeName;
    }

    @Column(name="weather_status",length=50)
    public String getWeatherStatus() {
        return weatherStatus;
    }

    public void setWeatherStatus(String weatherStatus) {
        this.weatherStatus = weatherStatus;
    }

    @Column(name="max_temperature",length=50)
    public String getMaxTemperature() {
        return maxTemperature;
    }

    public void setMaxTemperature(String maxTemperature) {
        this.maxTemperature = maxTemperature;
    }

    @Column(name="min_temperature",length=50)
    public String getMinTemperature() {
        return minTemperature;
    }

    public void setMinTemperature(String minTemperature) {
        this.minTemperature = minTemperature;
    }

    @Column(name="wind_power",length=50)
    public String getWindPower() {
        return windPower;
    }

    public void setWindPower(String windPower) {
        this.windPower = windPower;
    }

    public static long getSerialversionuid() {
        return serialVersionUID;
    }

    /** Two records are equal when the other object is an ActuallyWeather with an equal id. */
    @Override
    public boolean equals(Object o) {
        if (!(o instanceof ActuallyWeather)) {
            return false;
        }
        ActuallyWeather other = (ActuallyWeather) o;
        EqualsBuilder comparison = new EqualsBuilder();
        comparison.append(this.id, other.id);
        return comparison.isEquals();
    }

    /** Hash code derived from the id only, consistent with {@link #equals(Object)}. */
    @Override
    public int hashCode() {
        HashCodeBuilder hash = new HashCodeBuilder(17, 37);
        hash.append(id);
        return hash.toHashCode();
    }

    /** Multi-line representation containing only the id. */
    @Override
    public String toString() {
        ToStringBuilder text = new ToStringBuilder(this, ToStringStyle.MULTI_LINE_STYLE);
        text.append("id", id);
        return text.toString();
    }
}
View Code
预报实体类(ActuallyForecastWeather)与当天实体类(ActuallyWeather)的字段完全一致,只是映射的表名不同。
参考文档:http://www.cnblogs.com/shuilangyizu/p/6595588.html
以上是 java网页数据抓取实例 的全部内容, 来源链接: utcz.com/z/392854.html