java网页数据抓取实例

java

网页上面数据如下:

如果想要获取上图所示网页的数据,代码如下:

(1)调度类,主要调用工具类中的方法获取数据并入库

package com.jointsky.jointframe.weather.jobservice;

import java.util.HashMap;

import java.util.List;

import java.util.Map;

import org.apache.commons.lang.StringUtils;

import org.springframework.transaction.annotation.Transactional;

import com.jointsky.jointframe.scheduler.exception.ExecutionException;

import com.jointsky.jointframe.scheduler.quartz.JobService;

import com.jointsky.jointframe.weather.entity.ActuallyForecastWeather;

import com.jointsky.jointframe.weather.entity.ActuallyWeather;

import com.jointsky.jointframe.weather.service.ActuallyForecastWeatherManager;

import com.jointsky.jointframe.weather.service.ActuallyWeatherManager;

import com.jointsky.jointframe.weather.utils.UrlInfo;

/**
 * Scheduled job that refreshes the stored current-day and forecast weather records.
 *
 * <p>For every county code listed by {@link UrlInfo#getCountyCodes()} the job builds the page
 * URL, scrapes the current-day and forecast records through {@link UrlInfo}, stamps them with
 * the city/county names from {@link UrlInfo#getDistrict(String)}, and then updates the stored
 * row with the same forecast time and place name, or inserts a new row when none is found.</p>
 *
 * <p>A failure while processing one county is reported and does not stop the remaining
 * counties.</p>
 */
@Transactional
public class ActuallyWeatherJobService implements JobService {

    /** Manager used to look up, insert and update current-day weather records. */
    private ActuallyWeatherManager actuallyWeatherManager;

    /**
     * Current-day record most recently handled by {@link #execute(Map)}. It is only mirrored
     * here so that {@link #getActuallyWeather()} keeps exposing it; processing uses locals.
     */
    private ActuallyWeather actuallyWeather;

    /** Manager used to look up, insert and update forecast weather records. */
    private ActuallyForecastWeatherManager actuallyForecastWeatherManager;

    /**
     * Forecast record most recently handled by {@link #execute(Map)}. It is only mirrored here
     * so that {@link #getActuallyForecastWeather()} keeps exposing it.
     */
    private ActuallyForecastWeather actuallyForecastWeather;

    /**
     * Parameter map bean property. It is kept for compatibility with existing configuration;
     * lookups build their own fresh map so no state is carried from one record to the next.
     */
    private Map<String, Object> map = new HashMap<String, Object>();

    /**
     * Runs one refresh over every configured county code.
     *
     * @param arg0 scheduler-supplied parameters; not read by this job
     * @throws ExecutionException declared by {@link JobService}; per-county failures are caught
     *         and reported instead of being propagated
     */
    @Override
    public void execute(Map<String, Object> arg0) throws ExecutionException {
        System.out.println("实况天气资料数据获取调度成功");
        for (String countyCode : UrlInfo.getCountyCodes().split(",")) {
            if (StringUtils.isEmpty(countyCode)) {
                continue;
            }
            try {
                refreshCounty(countyCode);
            } catch (Exception e) {
                // Best effort: one unreachable or malformed page must not block other counties.
                System.err.println("Failed to refresh weather data for county code " + countyCode);
                e.printStackTrace();
            }
        }
    }

    /**
     * Downloads and stores the current-day and forecast records of a single county.
     * Both lists are fetched before anything is written.
     */
    private void refreshCounty(String countyCode) throws Exception {
        String url = UrlInfo.getUrl(countyCode);
        Map<String, Object> district = UrlInfo.getDistrict(countyCode);
        String cityLevel = (String) district.get("cityLevel");
        String countyLevel = (String) district.get("countyLevel");

        List<ActuallyWeather> todayRecords = UrlInfo.getURLInfoOfActully(url, "utf-8");
        List<ActuallyForecastWeather> forecastRecords = UrlInfo.getURLInfoOfForecast(url, "utf-8");

        for (ActuallyWeather record : todayRecords) {
            storeTodayRecord(record, cityLevel, countyLevel);
        }
        for (ActuallyForecastWeather record : forecastRecords) {
            storeForecastRecord(record, cityLevel, countyLevel);
        }
    }

    /** Updates the matching current-day row, or inserts the record when no row matches. */
    private void storeTodayRecord(ActuallyWeather record, String cityLevel, String countyLevel)
            throws Exception {
        actuallyWeather = record;
        record.setCityLevel(cityLevel);
        record.setCountyLevel(countyLevel);

        String existingId = actuallyWeatherManager.findIdByParams(
                lookupParams(record.getForecastTime(), record.getPlaceName()));
        if (StringUtils.isNotEmpty(existingId)) {
            record.setId(existingId);
            actuallyWeatherManager.updateWeather(record);
        } else {
            actuallyWeatherManager.save(record);
        }
    }

    /** Updates the matching forecast row, or inserts the record when no row matches. */
    private void storeForecastRecord(ActuallyForecastWeather record, String cityLevel,
            String countyLevel) throws Exception {
        actuallyForecastWeather = record;
        record.setCityLevel(cityLevel);
        record.setCountyLevel(countyLevel);

        String existingId = actuallyForecastWeatherManager.findIdByParams(
                lookupParams(record.getForecastTime(), record.getPlaceName()));
        if (StringUtils.isNotEmpty(existingId)) {
            record.setId(existingId);
            actuallyForecastWeatherManager.updateForecastWeather(record);
        } else {
            actuallyForecastWeatherManager.save(record);
        }
    }

    /**
     * Builds a fresh lookup map holding the non-empty values under the keys
     * {@code forecastTime} and {@code placeName}.
     * NOTE(review): when both values are empty the lookup runs with an empty map, exactly as
     * before; what the manager returns in that case is not visible here — confirm.
     */
    private static Map<String, Object> lookupParams(String forecastTime, String placeName) {
        Map<String, Object> params = new HashMap<String, Object>();
        if (StringUtils.isNotEmpty(forecastTime)) {
            params.put("forecastTime", forecastTime);
        }
        if (StringUtils.isNotEmpty(placeName)) {
            params.put("placeName", placeName);
        }
        return params;
    }

    public ActuallyWeatherManager getActuallyWeatherManager() {
        return actuallyWeatherManager;
    }

    public void setActuallyWeatherManager(ActuallyWeatherManager actuallyWeatherManager) {
        this.actuallyWeatherManager = actuallyWeatherManager;
    }

    public ActuallyWeather getActuallyWeather() {
        return actuallyWeather;
    }

    public void setActuallyWeather(ActuallyWeather actuallyWeather) {
        this.actuallyWeather = actuallyWeather;
    }

    public Map<String, Object> getMap() {
        return map;
    }

    public void setMap(Map<String, Object> map) {
        this.map = map;
    }

    public ActuallyForecastWeatherManager getActuallyForecastWeatherManager() {
        return actuallyForecastWeatherManager;
    }

    public void setActuallyForecastWeatherManager(
            ActuallyForecastWeatherManager actuallyForecastWeatherManager) {
        this.actuallyForecastWeatherManager = actuallyForecastWeatherManager;
    }

    public ActuallyForecastWeather getActuallyForecastWeather() {
        return actuallyForecastWeather;
    }

    public void setActuallyForecastWeather(ActuallyForecastWeather actuallyForecastWeather) {
        this.actuallyForecastWeather = actuallyForecastWeather;
    }
}

View Code

(2)工具类,主要为一些执行查询数据的实现方法

package com.jointsky.jointframe.weather.utils;

import java.io.BufferedReader;

import java.io.InputStream;

import java.io.InputStreamReader;

import java.net.HttpURLConnection;

import java.net.URL;

import java.text.SimpleDateFormat;

import java.util.ArrayList;

import java.util.Date;

import java.util.HashMap;

import java.util.List;

import java.util.Map;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;

import com.jointsky.jointframe.weather.entity.ActuallyForecastWeather;

import com.jointsky.jointframe.weather.entity.ActuallyWeather;

/**
 * Utility that downloads the weather page for a county and extracts current-day and forecast
 * records from its HTML with regular expressions.
 *
 * <p>The page is reduced to attribute-free tags, split on {@code <th>}, and every chunk whose
 * first {@code <font>} text is "风力" is treated as one data section. The first sections belong
 * to the remaining time slots of today (how many depends on the current hour); the sections
 * after them are forecast days.</p>
 */
public class UrlInfo {

    /** Charset used when the caller supplies none. */
    private static final String DEFAULT_CHARSET = "utf-8";

    /** Connect timeout so that an unreachable host cannot block the caller forever. */
    private static final int CONNECT_TIMEOUT_MILLIS = 10000;

    /** Read timeout so that a stalled response cannot block the caller forever. */
    private static final int READ_TIMEOUT_MILLIS = 30000;

    /** One day in milliseconds, as a long so that multiplying by a day count cannot overflow. */
    private static final long MILLIS_PER_DAY = 24L * 60 * 60 * 1000;

    /** Header text identifying a chunk that carries data rows. */
    private static final String SECTION_MARKER = "风力";

    /** Matches opening and closing tags whose name starts with {@code a}; they are dropped. */
    private static final Pattern ANCHOR_TAG = Pattern.compile("</?a[^>]*>");

    /** Matches an opening tag with its attributes, capturing only the tag name. */
    private static final Pattern TAG_ATTRIBUTES = Pattern.compile("<(\\w+)[^>]*>");

    /** Pattern extracting the text enclosed by a {@code <font>} element. */
    private static Pattern proInfo = Pattern.compile("<font>(.*?)</font>", Pattern.DOTALL);

    /**
     * Comma-separated county codes to scrape (Ningxia; 190 townships in total).
     * Yinchuan: municipal district (11 townships) 53614; Helan County (7) 53610;
     * Yongning County (8) 53618; Lingwu City (8) 53619.
     */
    private static String countyCodes = "53614,53610,53618,53619";

    /** Cell values of one table row; a field stays {@code null} when its cell had no match. */
    private static final class Row {
        String placeName;
        String weatherStatus;
        String maxTemperature;
        String minTemperature;
        String windPower;
    }

    /**
     * Downloads the page and returns the records of today's remaining time slots.
     *
     * @param urlInfo page URL
     * @param charset charset used to decode the page; UTF-8 when {@code null} or empty
     * @return the parsed records, never {@code null}
     * @throws Exception if the page cannot be downloaded or decoded
     */
    public static List<ActuallyWeather> getURLInfoOfActully(String urlInfo, String charset)
            throws Exception {
        return parseTodayRecords(fetchPage(urlInfo, charset));
    }

    /**
     * Downloads the page and returns the forecast records for the following days.
     *
     * @param urlInfo page URL
     * @param charset charset used to decode the page; UTF-8 when {@code null} or empty
     * @return the parsed records, never {@code null}
     * @throws Exception if the page cannot be downloaded or decoded
     */
    public static List<ActuallyForecastWeather> getURLInfoOfForecast(String urlInfo, String charset)
            throws Exception {
        return parseForecastRecords(fetchPage(urlInfo, charset));
    }

    /**
     * Downloads a page as UTF-8 and returns its simplified source: anchor tags removed, tag
     * attributes stripped, lines concatenated without separators, result trimmed.
     *
     * @param urlInfo page URL
     * @return the simplified page source
     * @throws Exception if the page cannot be downloaded or decoded
     */
    public static String getUrlInfo(String urlInfo) throws Exception {
        return fetchPage(urlInfo, DEFAULT_CHARSET);
    }

    /** Downloads and simplifies a page, always releasing the reader and the connection. */
    private static String fetchPage(String urlInfo, String charset) throws Exception {
        String encoding = StringUtils.isNotEmpty(charset) ? charset : DEFAULT_CHARSET;
        HttpURLConnection connection = (HttpURLConnection) new URL(urlInfo).openConnection();
        connection.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
        connection.setReadTimeout(READ_TIMEOUT_MILLIS);
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), encoding));
            StringBuilder page = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                // Drop links, then strip attributes so cells become plain <td>/<font> tags.
                line = ANCHOR_TAG.matcher(line).replaceAll("");
                line = TAG_ATTRIBUTES.matcher(line).replaceAll("<$1>");
                page.append(line);
            }
            return page.toString().trim();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (java.io.IOException ignored) {
                    // Closing is best effort; a failure here must not mask the real outcome.
                }
            }
            connection.disconnect();
        }
    }

    /** Parses the sections that belong to today's remaining time slots. */
    private static List<ActuallyWeather> parseTodayRecords(String html) {
        Date now = new Date();
        int todaySections = todaySectionCount(now);
        SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
        String pubTime = dayFormat.format(now);
        List<String> sections = weatherSections(html);

        List<ActuallyWeather> records = new ArrayList<ActuallyWeather>();
        int limit = Math.min(todaySections, sections.size());
        for (int index = 0; index < limit; index++) {
            String forecastTime = todaySlotStart(now, dayFormat, todaySections - (index + 1));
            for (Row row : parseRows(sections.get(index))) {
                ActuallyWeather record = new ActuallyWeather();
                record.setPlaceName(row.placeName);
                record.setWeatherStatus(row.weatherStatus);
                record.setMaxTemperature(row.maxTemperature);
                record.setMinTemperature(row.minTemperature);
                record.setWindPower(row.windPower);
                record.setPubTime(pubTime);
                record.setForecastTime(forecastTime);
                records.add(record);
            }
        }
        return records;
    }

    /** Parses the sections that follow today's slots; section n is n days ahead at hour 00. */
    private static List<ActuallyForecastWeather> parseForecastRecords(String html) {
        Date now = new Date();
        int todaySections = todaySectionCount(now);
        SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
        String pubTime = dayFormat.format(now);
        List<String> sections = weatherSections(html);

        List<ActuallyForecastWeather> records = new ArrayList<ActuallyForecastWeather>();
        for (int index = todaySections; index < sections.size(); index++) {
            int daysAhead = index + 1 - todaySections;
            Date day = new Date(now.getTime() + daysAhead * MILLIS_PER_DAY);
            String forecastTime = dayFormat.format(day) + " 00";
            for (Row row : parseRows(sections.get(index))) {
                ActuallyForecastWeather record = new ActuallyForecastWeather();
                record.setPlaceName(row.placeName);
                record.setWeatherStatus(row.weatherStatus);
                record.setMaxTemperature(row.maxTemperature);
                record.setMinTemperature(row.minTemperature);
                record.setWindPower(row.windPower);
                record.setPubTime(pubTime);
                record.setForecastTime(forecastTime);
                records.add(record);
            }
        }
        return records;
    }

    /**
     * Returns how many of today's time slots the page still lists: four up to hour 12
     * (06-12, 12-18, 18-24, 00-06 next day), three up to hour 18, two afterwards.
     * NOTE(review): hour 12 itself counts as "four"; confirm against the page's refresh time.
     */
    private static int todaySectionCount(Date now) {
        int hour = Integer.parseInt(new SimpleDateFormat("HH").format(now));
        if (hour <= 12) {
            return 4;
        }
        return hour <= 18 ? 3 : 2;
    }

    /**
     * Formats the start of a today slot as {@code yyyy-MM-dd HH}. With {@code slotsAfter}
     * slots still following it, the slot starts at hour {@code 24 - 6 * slotsAfter}; hour 24
     * is written as hour 00 of the next day.
     */
    private static String todaySlotStart(Date now, SimpleDateFormat dayFormat, int slotsAfter) {
        int startHour = 24 - 6 * slotsAfter;
        if (startHour == 24) {
            return dayFormat.format(new Date(now.getTime() + MILLIS_PER_DAY)) + " 00";
        }
        return dayFormat.format(now) + (startHour < 10 ? " 0" : " ") + startHour;
    }

    /** Returns, in page order, the {@code <th>} chunks whose first font text is the marker. */
    private static List<String> weatherSections(String html) {
        List<String> sections = new ArrayList<String>();
        for (String chunk : html.split("<th>")) {
            if (SECTION_MARKER.equals(firstFontText(chunk))) {
                sections.add(chunk);
            }
        }
        return sections;
    }

    /**
     * Splits a section on {@code <td>} and groups the cells four at a time into rows:
     * place name, weather status, "max/min" temperature, wind power. A row is emitted after
     * every fourth cell even if some cells had no match; trailing incomplete rows are dropped.
     */
    private static List<Row> parseRows(String section) {
        List<Row> rows = new ArrayList<Row>();
        String[] cells = section.split("<td>");
        Row row = new Row();
        // cells[0] is the text before the first <td> (the header residue), so start at 1.
        for (int i = 1; i < cells.length; i++) {
            String text = firstFontText(cells[i]);
            int column = i % 4;
            if (text != null) {
                if (column == 1) {
                    row.placeName = text;
                } else if (column == 2) {
                    row.weatherStatus = text;
                } else if (column == 3) {
                    String[] bounds = text.split("/");
                    // A cell without "max/min" is left unset rather than failing the page.
                    if (bounds.length >= 2) {
                        row.maxTemperature = bounds[0] + "℃";
                        row.minTemperature = bounds[1];
                    }
                } else {
                    row.windPower = text;
                }
            }
            if (column == 0) {
                rows.add(row);
                row = new Row();
            }
        }
        return rows;
    }

    /**
     * Returns the first {@code <font>} text of the fragment with spaces removed and cut at the
     * first {@code >}, or {@code null} when the fragment has no font element.
     */
    private static String firstFontText(String fragment) {
        Matcher matcher = proInfo.matcher(fragment);
        if (!matcher.find()) {
            return null;
        }
        String[] parts = matcher.group(1).trim().replace(" ", "").split(">");
        return parts.length == 0 ? "" : parts[0];
    }

    /**
     * Builds the page URL for a county.
     *
     * @param countyCode county code; when {@code null} or empty the bare page URL is returned
     * @return the page URL, with {@code ?sd=<countyCode>} appended when a code is given
     * @throws Exception never thrown by this implementation; kept for compatibility
     */
    public static String getUrl(String countyCode) throws Exception {
        String url = "http://3g.nx121.com/pc/tqybxzb.aspx";
        if (StringUtils.isNotEmpty(countyCode)) {
            url = url + "?sd=" + countyCode;
        }
        return url;
    }

    /**
     * Looks up the city-level and county-level district names for a county code.
     *
     * @param countyCode county code
     * @return a new map with keys {@code cityLevel} and {@code countyLevel}; empty when the
     *         code is not one of the four known codes
     * @throws Exception never thrown by this implementation; kept for compatibility
     */
    public static Map<String, Object> getDistrict(String countyCode) throws Exception {
        Map<String, Object> map = new HashMap<String, Object>();
        String countyLevel = null;
        if ("53614".equals(countyCode)) {
            countyLevel = "市辖区";
        } else if ("53610".equals(countyCode)) {
            countyLevel = "贺兰县";
        } else if ("53618".equals(countyCode)) {
            countyLevel = "永宁县";
        } else if ("53619".equals(countyCode)) {
            countyLevel = "灵武市";
        }
        if (countyLevel != null) {
            // Every known code belongs to the same city-level district.
            map.put("cityLevel", "银川市");
            map.put("countyLevel", countyLevel);
        }
        return map;
    }

    public static Pattern getProInfo() {
        return proInfo;
    }

    public static void setProInfo(Pattern proInfo) {
        UrlInfo.proInfo = proInfo;
    }

    public static String getCountyCodes() {
        return countyCodes;
    }

    public static void setCountyCodes(String countyCodes) {
        UrlInfo.countyCodes = countyCodes;
    }
}

View Code

(3)实体类,用于存放数据的bean

package com.jointsky.jointframe.weather.entity;

import javax.persistence.Column;

import javax.persistence.Entity;

import javax.persistence.Table;

import org.apache.commons.lang.builder.EqualsBuilder;

import org.apache.commons.lang.builder.HashCodeBuilder;

import org.apache.commons.lang.builder.ToStringBuilder;

import org.apache.commons.lang.builder.ToStringStyle;

import org.hibernate.annotations.Cache;

import org.hibernate.annotations.CacheConcurrencyStrategy;

import com.jointsky.jointframe.core.entity.IdEntity;

/**
 * Entity for one current-day weather record, mapped to table {@code t_actually_weather}.
 *
 * <p>Column mappings are declared on the getters. {@code equals}, {@code hashCode} and
 * {@code toString} use only the {@code id} inherited from {@link IdEntity}.</p>
 */
@Entity
@Table(name = "t_actually_weather")
@Cache(usage = CacheConcurrencyStrategy.READ_WRITE)
public class ActuallyWeather extends IdEntity {

    private static final long serialVersionUID = -5324072662712469478L;

    /** City-level district name. */
    private String cityLevel;

    /** County-level district name. */
    private String countyLevel;

    /** Publication time. */
    private String pubTime;

    /** Place name. */
    private String placeName;

    /** Weather status, e.g. cloudy, sunny, light rain. */
    private String weatherStatus;

    /** Highest temperature. */
    private String maxTemperature;

    /** Lowest temperature. */
    private String minTemperature;

    /** Wind power. */
    private String windPower;

    /** Forecast time. */
    private String forecastTime;

    public static long getSerialversionuid() {
        return serialVersionUID;
    }

    @Column(name="city_level",length=50)
    public String getCityLevel() {
        return this.cityLevel;
    }

    public void setCityLevel(String cityLevel) {
        this.cityLevel = cityLevel;
    }

    @Column(name="county_level",length=50)
    public String getCountyLevel() {
        return this.countyLevel;
    }

    public void setCountyLevel(String countyLevel) {
        this.countyLevel = countyLevel;
    }

    @Column(name="pub_time",length=50)
    public String getPubTime() {
        return this.pubTime;
    }

    public void setPubTime(String pubTime) {
        this.pubTime = pubTime;
    }

    @Column(name="place_name",length=50)
    public String getPlaceName() {
        return this.placeName;
    }

    public void setPlaceName(String placeName) {
        this.placeName = placeName;
    }

    @Column(name="weather_status",length=50)
    public String getWeatherStatus() {
        return this.weatherStatus;
    }

    public void setWeatherStatus(String weatherStatus) {
        this.weatherStatus = weatherStatus;
    }

    @Column(name="max_temperature",length=50)
    public String getMaxTemperature() {
        return this.maxTemperature;
    }

    public void setMaxTemperature(String maxTemperature) {
        this.maxTemperature = maxTemperature;
    }

    @Column(name="min_temperature",length=50)
    public String getMinTemperature() {
        return this.minTemperature;
    }

    public void setMinTemperature(String minTemperature) {
        this.minTemperature = minTemperature;
    }

    @Column(name="wind_power",length=50)
    public String getWindPower() {
        return this.windPower;
    }

    public void setWindPower(String windPower) {
        this.windPower = windPower;
    }

    @Column(name="forecast_time",length=50)
    public String getForecastTime() {
        return this.forecastTime;
    }

    public void setForecastTime(String forecastTime) {
        this.forecastTime = forecastTime;
    }

    /** Renders only the id, in multi-line style. */
    @Override
    public String toString() {
        ToStringBuilder builder = new ToStringBuilder(this, ToStringStyle.MULTI_LINE_STYLE);
        builder.append("id", id);
        return builder.toString();
    }

    /** Two records are equal when the other object is an {@code ActuallyWeather} with an equal id. */
    @Override
    public boolean equals(Object o) {
        if (!(o instanceof ActuallyWeather)) {
            return false;
        }
        ActuallyWeather other = (ActuallyWeather) o;
        return new EqualsBuilder().append(this.id, other.id).isEquals();
    }

    /** Hash derived from the id only, consistent with {@link #equals(Object)}. */
    @Override
    public int hashCode() {
        HashCodeBuilder builder = new HashCodeBuilder(17, 37);
        builder.append(id);
        return builder.toHashCode();
    }
}

View Code

预报实体类和当天的字段完全一致,不过表名不一致。

参考文档:http://www.cnblogs.com/shuilangyizu/p/6595588.html

以上是 java网页数据抓取实例 的全部内容, 来源链接: utcz.com/z/392854.html

回到顶部