java实现网页结构分析列表发现

java

原文出处:http://www.yund.tech/zdetail.html?type=1&id=ee06002e2b83e7677c30aedc52d3429e    

作者:jstarseven 


  

现在的网站千奇百怪,什么样的格式都有。需要提取网页中的列表数据时,逐个站点分析处理很让人头疼。本文给出一个页面结构分析程序,可以大致识别出页面中的列表结构。

废话不多说,我也不会说,show me code,code is terrible,so what  hahaha。-------jstarseven

1.抽取元素dom结构框架

 1     /**

2 * 分析元素dom结构框架

3 *

4 * @param node

5 * @return

6 */

7 public String filterHtml(Element node) {

8 //去除节点的属性值

9 Document new_node = Jsoup.parse(node.outerHtml());

10 Elements elements = new_node.getAllElements();

11 for (Element item : elements) {

12 Attributes attributes = item.attributes();

13 for (Attribute a : attributes) {

14 if (a.getKey().equals(KeysEnum.attr_scroce)) {

15 item.removeAttr(a.getKey());

16 continue;

17 }

18 a.setValue(StringUtils.EMPTY);

19 }

20 }

21 //去除注释节点,节点文本内容

22 String str_new = new_node.outerHtml().replaceAll("<!--?(.*?)-->", "");

23 str_new = str_new.replaceAll("\\s*", "");

24 str_new = str_new.replaceAll(">(.*?)<", "><");

25 return str_new;

26 }

2.采用动态规划处理两个字符串相似度

 /**
  * Levenshtein edit-distance helper used to decide whether two strings are
  * "similar enough".
  */
 public class SimilarDegree {

     /** Similarity threshold that callers compare a similarity ratio against. */
     public static final double degree = 0.8;

     /**
      * Computes the Levenshtein distance between two strings with dynamic
      * programming: the minimum number of single-character insertions,
      * deletions and substitutions that turn {@code source} into {@code target}.
      *
      * <p>Only two rows of the DP table are kept, so memory use is proportional
      * to the shorter input instead of the product of both lengths.
      *
      * @param source first string, must not be {@code null}
      * @param target second string, must not be {@code null}
      * @return the edit distance, {@code 0} when the strings are equal
      */
     public static int EditDistance(String source, String target) {
         // The distance is symmetric, so keep the shorter string on the column axis.
         char[] rows = source.toCharArray();
         char[] cols = target.toCharArray();
         if (rows.length < cols.length) {
             char[] swap = rows;
             rows = cols;
             cols = swap;
         }

         int[] previous = new int[cols.length + 1];
         int[] current = new int[cols.length + 1];
         for (int j = 0; j <= cols.length; j++) {
             previous[j] = j;
         }

         for (int i = 1; i <= rows.length; i++) {
             current[0] = i;
             for (int j = 1; j <= cols.length; j++) {
                 if (rows[i - 1] == cols[j - 1]) {
                     current[j] = previous[j - 1];
                 } else {
                     int insert = current[j - 1] + 1;
                     int delete = previous[j] + 1;
                     int replace = previous[j - 1] + 1;
                     current[j] = Math.min(insert, Math.min(delete, replace));
                 }
             }
             // The row just filled becomes the "previous" row of the next iteration.
             int[] swap = previous;
             previous = current;
             current = swap;
         }
         return previous[cols.length];
     }

     public static void main(String[] args) {
         System.out.println(EditDistance("html > body > ul > li.proiect_item:nth-child(1) > div.item_row.item_row_title > div:nth-child(1) > a",
                 "html > body > ul > li.proiect_item:nth-child(2) > div.item_row.item_row_title > div:nth-child(1) > a"));
     }

 }

View Code

3.对网页中每个节点的一级孩子节点分类

 1 /**

2 * 统计列表下各个一级节点类型及个数

3 *

4 * @param node

5 * @return

6 */

7 private Map<String, Integer> getGroupNode(Element node) {

8 Map<String, Integer> map = new HashMap<String, Integer>();

9 Elements children = node.children();

10 for (Element item : children) {

11 if (KeysEnum.input.equalsIgnoreCase(item.tagName()) || KeysEnum.br.equalsIgnoreCase(item.tagName())

12 || KeysEnum.script.equalsIgnoreCase(item.tagName()) || KeysEnum.link.equalsIgnoreCase(item.tagName())

13 || KeysEnum.style.equalsIgnoreCase(item.tagName()) || KeysEnum.meta.equalsIgnoreCase(item.tagName())

14 || KeysEnum.select.equalsIgnoreCase(item.tagName()) || KeysEnum.option.equalsIgnoreCase(item.tagName())

15 || KeysEnum.video.equals(item.tagName()) || KeysEnum.audio.equals(item.tagName())

16 || KeysEnum.textarea.equals(item.tagName())) continue;

17 String key = filterHtml(item);

18 if (map.containsKey(key)) {

19 map.put(key, (Integer) map.get(key) + 1);

20 } else {

21 boolean is_like = false;

22 for (String map_key : map.keySet()) {

23 int dis = SimilarDegree.EditDistance(key, (String) map_key);

24 float v = (float) (key.length() - dis) / key.length();

25 if (v > SimilarDegree.degree) {

26 map.put(map_key, (Integer) map.get(map_key) + 1);

27 is_like = true;

28 break;

29 }

30 }

31 if (!is_like) map.put(key, 1);

32 }

33 }

34 return map;

35 }

4.处理网页中每个元素的叶子节点

 1 /**

2 * 获取叶子节点选择器

3 *

4 * @param node

5 */

6 public static List<String> getYeziNodeSel(Element node) {

7 List<String> list = new ArrayList<String>();

8 Elements all = node.getAllElements();

9 for (Element item : all) {

10 if (item.children().isEmpty()) list.add(item.cssSelector());

11 }

12 return list;

13 }

5.时间提取工具类

  /**
   * General-purpose, best-effort date/time extractor (original author: jstarseven).
   *
   * <p>{@link #parse(Object)} tries a fixed list of regular expressions in order
   * and returns a {@link Date} for the first one that is found in the text, or
   * {@code null} when nothing usable is found. Supported shapes include absolute
   * dates, dates without a year, clock times with an optional 今天/昨天/前天 prefix,
   * relative expressions such as "3天前", and Chinese numerals.
   */
  public class DateParser {
      /** Hour applied to date-only matches when positive; otherwise hour 0 is used. */
      private static int timezone = 0;

      /** Patterns in the order they are tried by {@link #parse(Object)}. */
      private static final Pattern[] DPTN = {

              // [0] day-or-month first, four-digit 20xx year last, optional time
              Pattern.compile(
                      "(\\d{1,2})[\\s\\-\\/](\\d{1,2})[\\s\\-\\/](20\\d{2})\\s{0,2}((\\d{1,2})[:\\s](\\d{1,2})[:\\s]?(\\d{1,2})?)?"),

              // [1] year first (two or four digits), optional time
              Pattern.compile(
                      "((20)?\\d{2}) {0,2}[\\.\\-/年] {0,2}(\\d{1,2}) {0,2}[\\.\\-/月] {0,2}(\\d{1,2}) {0,2}[日 \\s]{0,2}((上午)|(下午))?\\s{0,2}((\\d{1,2})[:\\s时](\\d{1,2})[:\\s分]?(\\d{1,2})?)?"),

              // [2] year/MMdd
              Pattern.compile("((20)?\\d{2})/(\\d{2})(\\d{2})"),

              // [3] month and day without a year, optional time
              Pattern.compile(
                      "(\\d{1,2})[\\.\\-\\s/月](\\d{1,2})[日\\s]{0,2}((上午)|(下午))?\\s{0,2}((\\d{1,2})[:\\s](\\d{1,2})[:\\s]?(\\d{1,2})?)?"),

              // [4] clock time with optional today / yesterday / day-before prefix
              Pattern.compile("([今前昨]天)?\\s{0,4}(\\d{1,2})[:\\s]{1,3}(\\d{1,2})[:\\s]?(\\d{1,2})?"),

              // [5] bare today / yesterday / day-before
              Pattern.compile("[今前昨]天"),

              // [6] "<n|半> <unit> 前" relative expression
              Pattern.compile("((\\d{1,2})|(半))\\s*个?([天秒小时分钟周月年]{1,2})前"),

              // [7] "<h>小时<m>分钟前"
              Pattern.compile("(\\d{1,2})小?时(\\d{1,2})分钟?前"),

              // [8] compact yyyyMMdd
              Pattern.compile("(20\\d{2})[01]?(\\d{2})[012]?(\\d{2})") };

      /**
       * Extracts a date from an arbitrary value.
       *
       * @param obj a {@link Date} (returned as is), a {@link Number} (epoch
       *            milliseconds), or any other object whose {@code toString()}
       *            text is scanned for a date
       * @return the parsed date, or {@code null} for {@code null}, blank or
       *         "null" input, for out-of-range fields, or when no pattern matches
       */
      public static Date parse(Object obj) {
          if (obj == null) {
              return null;
          }
          if (obj instanceof Date) {
              return (Date) obj;
          }
          if (obj instanceof Number) {
              return new Date(((Number) obj).longValue());
          }
          // toString() instead of a String cast so any CharSequence is accepted.
          String str = obj.toString().trim();
          if (str.length() == 0 || "null".equalsIgnoreCase(str)) {
              return null;
          }
          // transZH strips the 上午/下午 markers, so the afternoon flag has to be
          // captured first; otherwise the +12h adjustment below can never fire.
          boolean afternoon = str.contains("下午");
          str = transZH(str);
          Calendar c = Calendar.getInstance();
          c.setTimeInMillis(System.currentTimeMillis());
          int dateOnlyHour = timezone > 0 ? timezone : 0;

          Matcher mt = DPTN[0].matcher(str);
          if (mt.find()) {
              int date = Integer.parseInt(mt.group(2));
              if (date == 0 || date > 31) {
                  return null;
              }
              int month = Integer.parseInt(mt.group(1));
              if (month <= 0) {
                  return null;
              }
              if (month > 12) {
                  // First field cannot be a month: treat the text as day-month-year.
                  if (date <= 12 && month < 32) {
                      int tmp = month;
                      month = date;
                      date = tmp;
                  } else {
                      return null;
                  }
              }
              int year = Integer.parseInt(mt.group(3));
              if (year < 2000 || year > 2099) {
                  return null;
              }
              String hms = mt.group(4);
              if (hms == null || hms.length() == 0) {
                  return at(c, year, month - 1, date, dateOnlyHour, 0, 0);
              }
              int hour = Integer.parseInt(mt.group(5));
              if (hour >= 24) {
                  return null;
              }
              int min = Integer.parseInt(mt.group(6));
              if (min >= 60) {
                  return null;
              }
              return at(c, year, month - 1, date, hour, min, seconds(mt.group(7)));
          }

          mt = DPTN[1].matcher(str);
          if (mt.find()) {
              String sy = mt.group(1);
              if (sy.length() == 2) {
                  sy = "20" + sy;
              }
              int year = Integer.parseInt(sy);
              if (year < 2000 || year > 2099) {
                  return null;
              }
              int month = Integer.parseInt(mt.group(3)) - 1;
              if (month < 0 || month > 11) {
                  return null;
              }
              int date = Integer.parseInt(mt.group(4));
              if (date > 31) {
                  return null;
              }
              String time = mt.group(8);
              if (time == null || time.length() == 0) {
                  return at(c, year, month, date, dateOnlyHour, 0, 0);
              }
              int hour = Integer.parseInt(mt.group(9));
              if (hour >= 24) {
                  return null;
              }
              int min = Integer.parseInt(mt.group(10));
              if (min >= 60) {
                  return null;
              }
              int sec = seconds(mt.group(11));
              if (afternoon && hour < 12) {
                  hour += 12;
              }
              return at(c, year, month, date, hour, min, sec);
          }

          mt = DPTN[2].matcher(str);
          if (mt.find()) {
              String strYear = mt.group(1);
              if (!strYear.startsWith("20")) {
                  strYear = "20" + strYear;
              }
              int year = Integer.parseInt(strYear);
              int month = Integer.parseInt(mt.group(3)) - 1;
              int day = Integer.parseInt(mt.group(4));
              return at(c, year, month, day, 0, 0, 0);
          }

          mt = DPTN[3].matcher(str);
          if (mt.find()) {
              int year = c.get(Calendar.YEAR);
              int month = Integer.parseInt(mt.group(1)) - 1;
              if (month < 0) {
                  return null;
              }
              // A month later than the current one is taken to mean last year.
              if (month > c.get(Calendar.MONTH)) {
                  year--;
              }
              int date = Integer.parseInt(mt.group(2));
              if (date > 31) {
                  return null;
              }
              String time = mt.group(6);
              if (time == null || time.length() == 0) {
                  return at(c, year, month, date, dateOnlyHour, 0, 0);
              }
              int hour = Integer.parseInt(mt.group(7));
              if (hour >= 24) {
                  return null;
              }
              int min = Integer.parseInt(mt.group(8));
              if (min >= 60) {
                  return null;
              }
              int sec = seconds(mt.group(9));
              if (afternoon && hour < 12) {
                  hour += 12;
              }
              return at(c, year, month, date, hour, min, sec);
          }

          mt = DPTN[4].matcher(str);
          if (mt.find()) {
              int hour = Integer.parseInt(mt.group(2));
              if (hour >= 24) {
                  return null;
              }
              int min = Integer.parseInt(mt.group(3));
              if (min >= 60) {
                  return null;
              }
              shiftByDayWord(c, mt.group(1));
              c.set(Calendar.HOUR_OF_DAY, hour);
              c.set(Calendar.MINUTE, min);
              // Use the parsed seconds (or 0) rather than inheriting the current ones.
              c.set(Calendar.SECOND, seconds(mt.group(4)));
              c.set(Calendar.MILLISECOND, 0);
              return c.getTime();
          }

          mt = DPTN[5].matcher(str);
          if (mt.find()) {
              // Only a day word is known, so the current time of day is kept.
              shiftByDayWord(c, mt.group(0));
              return c.getTime();
          }

          mt = DPTN[6].matcher(str);
          if (mt.find()) {
              long unit = unitMillis(mt.group(4));
              if (unit == 0L) {
                  return null;
              }
              String amount = mt.group(1);
              long offset = "半".equals(amount) ? unit / 2L : Integer.parseInt(amount) * unit;
              return new Date(System.currentTimeMillis() - offset);
          }

          mt = DPTN[7].matcher(str);
          if (mt.find()) {
              int hh = Integer.parseInt(mt.group(1));
              int nn = Integer.parseInt(mt.group(2));
              long offset = 3600000L * hh + 60000L * nn;
              return new Date(System.currentTimeMillis() - offset);
          }

          mt = DPTN[8].matcher(str);
          if (mt.find()) {
              int year = Integer.parseInt(mt.group(1));
              if (year < 2000 || year > 2099) {
                  return null;
              }
              int month = Integer.parseInt(mt.group(2)) - 1;
              if (month < 0 || month > 11) {
                  return null;
              }
              int date = Integer.parseInt(mt.group(3));
              if (date > 31) {
                  return null;
              }
              return at(c, year, month, date, dateOnlyHour, 0, 0);
          }
          return null;
      }

      /** Sets all calendar fields, zeroes the milliseconds, and returns the resulting date. */
      private static Date at(Calendar c, int year, int month, int date, int hour, int min, int sec) {
          c.set(year, month, date, hour, min, sec);
          c.set(Calendar.MILLISECOND, 0);
          return c.getTime();
      }

      /** Parses an optional seconds group; an absent or empty group counts as 0. */
      private static int seconds(String group) {
          return (group == null || group.length() == 0) ? 0 : Integer.parseInt(group);
      }

      /** Moves the calendar back one day for 昨天 and two days for 前天; anything else is a no-op. */
      private static void shiftByDayWord(Calendar c, String dayWord) {
          if ("昨天".equals(dayWord)) {
              c.add(Calendar.DATE, -1);
          } else if ("前天".equals(dayWord)) {
              c.add(Calendar.DATE, -2);
          }
      }

      /** Returns the length of a relative-time unit in milliseconds, or 0 for an unknown unit. */
      private static long unitMillis(String unit) {
          switch (unit) {
              case "年":
                  return 31536000000L; // 365 days
              case "月":
                  return 2592000000L; // 30 days
              case "周":
                  return 604800000L;
              case "天":
                  return 86400000L;
              case "小时":
              case "时":
                  return 3600000L;
              case "分钟":
              case "分":
                  return 60000L;
              case "秒":
                  return 1000L;
              default:
                  return 0L;
          }
      }

      /**
       * Normalizes Chinese date text: "整" becomes "0分", the 上午/下午 markers are
       * removed, the numerals 〇-九 become the digits 0-9, and every 十 is expanded
       * according to its own neighbours (十 to 10, 二十 to 20, 十九 to 19, 二十三 to 23).
       */
      private static String transZH(String string) {
          String zh = "〇一二三四五六七八九";
          string = string.replace("整", "0分").replaceAll("[上下]午", "");
          StringBuilder mapped = new StringBuilder();
          for (char ch : string.toCharArray()) {
              int index = zh.indexOf(ch);
              if (index >= 0) {
                  mapped.append(index);
              } else {
                  mapped.append(ch);
              }
          }
          String str = mapped.toString();
          if (str.indexOf('十') < 0) {
              return str;
          }
          // Each 十 is resolved on its own and with bounds checks, so a 十 at either
          // end of the text is safe and mixed forms such as "十月二十日" come out right.
          StringBuilder out = new StringBuilder(str.length() + 2);
          for (int i = 0; i < str.length(); i++) {
              char ch = str.charAt(i);
              if (ch != '十') {
                  out.append(ch);
                  continue;
              }
              boolean digitBefore = i > 0 && Character.isDigit(str.charAt(i - 1));
              boolean digitAfter = i + 1 < str.length() && Character.isDigit(str.charAt(i + 1));
              if (digitBefore && digitAfter) {
                  continue; // 二十三: the tens digit is already present
              }
              if (digitBefore) {
                  out.append('0'); // 二十
              } else if (digitAfter) {
                  out.append('1'); // 十九
              } else {
                  out.append("10"); // 十
              }
          }
          return out.toString();
      }

      public static void main(String[] args) {
          System.out.println(parse("1982-01-01 00:00:00"));
          System.out.println(transZH("二〇一七年九月十日 上午十时整"));
          System.out.println(transZH("二〇一七年九月二十日 上午九时整"));
          System.out.println(transZH("二〇一七年九月十九日 上午九时整"));
          System.out.println(transZH("二〇一七年九月二十三日 上午九时整"));
          System.out.println("timezone=" + timezone);
          String[] testdata = { "1982-01-01 00:00:00","11-13 15:24", "2009-8-30 16:42:10", "8-23 15:24", "2周前", "3 天前", "12 分钟前", "3天前",
                  "前天 09:36", "昨天 09:21 ", "2010-12-17 00:23 ", "2010-12-17 ", "昨天 12:37 ", "2011-8-15 08:42",
                  "25-7-2011 11:43:57", "1-9-2011", "06-03", "半小时前", "今天发表", "昨天发表", "前天发表", "06-03-2010",
                  "02-01-2010 00:39", "3小时26分钟前", "2010-8-24 上午 01:17:32", "2010-8-24 下午 01:17:32", "7小时前 »",
                  "4/29/2010 1:31:00", "2012 年 1 月 31 日", "17时20分前", "2017年10月12日 14时30分", "二〇一七年九月十九日 上午九时整" };

          DateFormat df = DateFormat.getDateTimeInstance(DateFormat.MEDIUM, DateFormat.MEDIUM);
          for (String s : testdata) {
              Date d = parse(s);
              System.out.println(s + "\t\t" + (d == null ? "null" : df.format(d)));
          }
      }

  }

View Code

6.自定义比较器对网页所有元素排序,发现结果靠前的基本都是列表元素

  比较器:按照疑似列表的可能性

 1 /**

2 * 排序子节点

3 * 1.最大相同dom结构长度

4 * 2.最大相同dom结构元素数量

5 *

6 * @param nodes

7 * @return

8 */

9 private Elements sortBy(Elements nodes, String base_url) {

10 // System.setProperty("java.util.Arrays.useLegacyMergeSort", "true");

11 nodes.sort(new Comparator<Element>() {

12 @Override

13 public int compare(Element o1, Element o2) {

14 double o1_rate = reckonRate(o1);

15 double o2_rate = reckonRate(o2);

16 return (o2_rate > o1_rate) ? 1 : ((o2_rate == o1_rate) ? 0 : -1);

17 }

18

19 private double reckonRate(Element o) {

20 if (StringUtils.isNotBlank(base_url) && KeysEnum.a.equalsIgnoreCase(o.tagName()) && base_url.equalsIgnoreCase(o.attr(KeysEnum.attr_href)))

21 o.attr(KeysEnum.attr_list_tag_name, o.text());

22 if (null == o || o.children().size() < 2

23 || KeysEnum.html.equalsIgnoreCase(o.tagName()) || KeysEnum.body.equalsIgnoreCase(o.tagName()) || KeysEnum.link.equalsIgnoreCase(o.tagName())

24 || KeysEnum.head.equalsIgnoreCase(o.tagName()) || KeysEnum.title.equalsIgnoreCase(o.tagName()) || KeysEnum.meta.equalsIgnoreCase(o.tagName())

25 || KeysEnum.script.equalsIgnoreCase(o.tagName()) || KeysEnum.style.equalsIgnoreCase(o.tagName())) {

26 o.attr(KeysEnum.attr_scroce, "0");

27 return 0;

28 }

29 String style = o.attr(KeysEnum.style);

30 if (StringUtils.isNotBlank(style) && style.contains(KeysEnum.display_none)) {

31 o.attr(KeysEnum.attr_scroce, "0");

32 return 0;

33 }

34 Map<String, Object> maxKeyDom = getMaxKeyDom(o);

35 String key = (String) maxKeyDom.get(KeysEnum.max_key);

36 int num = (int) maxKeyDom.get(KeysEnum.max_num);

37 if (num < 2) {

38 o.attr(KeysEnum.attr_scroce, "0");

39 return 0;

40 }

41 int scroce = num * key.length();

42 Elements tags = o.children();

43 for (Element a : tags) {

44 if (KeysEnum.div.equalsIgnoreCase(a.tagName())) scroce += 5;

45 if (KeysEnum.ul.equalsIgnoreCase(a.tagName())) scroce += 10;

46 if (KeysEnum.li.equalsIgnoreCase(a.tagName())) scroce += 10;

47 if (KeysEnum.tbody.equalsIgnoreCase(a.tagName())) scroce += 5;

48 if (KeysEnum.table.equalsIgnoreCase(a.tagName())) scroce += 5;

49 if (KeysEnum.tr.equalsIgnoreCase(a.tagName())) scroce += 10;

50 if (KeysEnum.td.equalsIgnoreCase(a.tagName())) scroce += 1;

51 if (KeysEnum.a.equalsIgnoreCase(a.tagName())) scroce += 1;

52 if (KeysEnum.p.equalsIgnoreCase(a.tagName())) scroce += 1;

53 try {

54 Date time = DateParser.parse(a.text());

55 if (null != time) scroce += 20;

56 } catch (Exception e) {

57 }

58 }

59 if (o.text().contains(KeysEnum.next_page)) scroce += 100;

60 if (o.text().contains(KeysEnum.start_page) || o.text().contains(KeysEnum.fisrt_page)) scroce += 100;

61 if (o.text().contains(KeysEnum.end_page) || o.text().contains(KeysEnum.last_page) || o.text().contains(KeysEnum.final_page))

62 scroce += 100;

63 o.attr(KeysEnum.attr_scroce, String.valueOf(scroce));

64 return scroce;

65 }

66 });

67 return nodes;

68 }

7.处理页面html,调用列表分析返回json结果

  1  /**

2 * 提取页面列表元素的选择器以及页面分类标签

3 *

4 * @param document

5 * @param is_subitem

6 * @return

7 */

8 public static Map<String, Object> dealListNode(Document document, boolean is_subitem) throws Exception {

9 Map<String, Object> result = new HashMap<String, Object>();

10 try {

11 ListAutoFire listAutoFire = new ListAutoFire();

12 Elements list_node = listAutoFire.autoFireListNodes(document);

13 List<Map<String, Object>> lists = new ArrayList();

14 if (null != list_node && list_node.size() > 0) {

15 for (Element list_sel_item : list_node) {

16 if (list_sel_item.hasAttr(KeysEnum.attr_list_tag_name) && StringUtils.isNotBlank(list_sel_item.attr(KeysEnum.attr_list_tag_name))) {

17 result.put(KeysEnum.tag_name, list_sel_item.attr(KeysEnum.attr_list_tag_name));

18 continue;

19 }

20 Map<String, Object> list_dom_frame = new HashMap<>();

21 list_dom_frame.put(KeysEnum.list_sel, list_sel_item.cssSelector());

22 if (is_subitem) {

23 Map<String, List<String>> listItem = new HashMap<String, List<String>>();

24 for (Element item : list_sel_item.children())

25 listItem.put(item.cssSelector(), getYeziNodeSel(item));

26 list_dom_frame.put(KeysEnum.list_dom, listItem);

27 }

28 list_dom_frame.put(KeysEnum.attr_scroce, list_sel_item.attr(KeysEnum.attr_scroce));

29 lists.add(list_dom_frame);

30 }

31 }

32 result.put(KeysEnum.list, lists);

33 } catch (Exception e) {

34 throw new Exception(KeysEnum.error_info, e.getCause());

35 }

36 return result;

37 }

38

39 /**

40 * 处理网页结构

41 *

42 * @param home_url 入口地址

43 * @param list_index 列表元素获取数量

44 * @param is_subitem 是否处理列表元素子项抽取 true/false

45 * @param is_ifr 是否处理iframe true/false

46 * @return

47 */

48 public static Map<String, Object> getWebSiteFrame(String home_url, int list_index, boolean is_subitem, boolean is_ifr) {

49 Map<String, Object> result = new HashMap<String, Object>();

50 if (StringUtils.isBlank(home_url)) return result;

51 try {

52 Document html = Jsoup.connect(home_url).ignoreContentType(true).validateTLSCertificates(false).timeout(5000).get();

53 if (null == html) throw new Exception(KeysEnum.open_fail);

54 Map<String, Object> mapNode = dealListNode(html, is_subitem);

55 List listNode = (List) mapNode.get(KeysEnum.list);

56 result.put(KeysEnum.home_url, home_url);

57 result.put(KeysEnum.tag_name, mapNode.get(KeysEnum.tag_name));

58 result.put(KeysEnum.list, listNode.subList(0, listNode.size() > list_index ? list_index : listNode.size()));

59 result.put(KeysEnum.ifrs, new ArrayList());

60 if (is_ifr) {

61 List<Map<String, Object>> ifrs = (List<Map<String, Object>>) result.get(KeysEnum.ifrs);

62 Elements iframe_nodes = html.getElementsByTag(KeysEnum.iframe);

63 if (null != iframe_nodes) {

64 for (Element iframe : iframe_nodes) {

65 String iframe_url = iframe.attr(KeysEnum.attr_src);

66 if (StringUtils.isBlank(iframe_url)) continue;

67 try {

68 Document iframe_html = Jsoup.connect(iframe_url).ignoreContentType(true).validateTLSCertificates(false).timeout(5000).get();

69 if (null == iframe_html) continue;

70 Map<String, Object> ifrMapNode = dealListNode(iframe_html, is_subitem);

71 List ifrListNode = (List) ifrMapNode.get(KeysEnum.list);

72 Map<String, Object> ifr_map = new HashMap();

73 ifr_map.put(KeysEnum.home_url, iframe_url);

74 ifr_map.put(KeysEnum.tag_name, ifrMapNode.get(KeysEnum.tag_name));

75 ifr_map.put(KeysEnum.list, ifrListNode.subList(0, ifrListNode.size() > list_index ? list_index : ifrListNode.size()));

76 ifrs.add(ifr_map);

77 } catch (Exception e) {

78 e.printStackTrace();

79 }

80 }

81 }

82 }

83 } catch (Exception e) {

84 e.printStackTrace();

85 result.clear();

86 result.put(KeysEnum.home_url, home_url);

87 result.put(KeysEnum.error, KeysEnum.error_info);

88 result.put(KeysEnum.message, e.toString());

89 }

90 return result;

91 }

92

93 /**

94 * 处理网页结构

95 *

96 * @param home_url 入口地址

97 * @param list_index 列表元素获取数量

98 * @param is_subitem 是否处理列表元素子项抽取 true/false

99 * @return

100 */

101 public static Map<String, Object> getWebSiteFrame(String home_url, int list_index, boolean is_subitem) {

102 return getWebSiteFrame(home_url, list_index, is_subitem, false);

103 }

104

105 /**

106 * 处理网页结构

107 *

108 * @param home_url 入口地址

109 * @param list_index 列表元素获取数量

110 * @return

111 */

112 public static Map<String, Object> getWebSiteFrame(String home_url, int list_index) {

113 return getWebSiteFrame(home_url, list_index, false);

114 }

115

116 /**

117 * 处理网页结构

118 *

119 * @param home_url 入口地址

120 * @return

121 */

122 public static Map<String, Object> getWebSiteFrame(String home_url) {

123 return getWebSiteFrame(home_url, 10);

124 }

View Code

8.生成页面分析结果标记文件

 1 public static void createMarkFile(Map siteFrame, String home_url, String path) {

2 try {

3 Document doc = Jsoup.connect(home_url).ignoreContentType(true).validateTLSCertificates(false).timeout(5000).get();

4 if (null == doc) return;

5 String style = ".mark_color {" +

6 "position:relative;" +

7 "pointer-events:none;" +

8 "left:0px;top:0px;" +

9 "display:inline-block;" +

10 "margin:-2px;width:100%;" +

11 "height:100%;" +

12 "border:dashed 2px #FF69B4;" +

13 "background-color: #43CD80;" +

14 "opacity:0.75;" +

15 "} " ;

16 List list = (List) siteFrame.get("list");

17 for (Object item : list) {

18 Map item_map = (Map) item;

19 String sel = (String) item_map.get("list_sel");

20 doc.select(sel).addClass("mark_color");

21 }

22 String content = doc.html();

23 content = content.contains("<base") ? content : content.replaceFirst("<head", "<base href='" + home_url + "'/><style>" + style + "</style><head");

24 FileUtils.writeStringToFile(new File(path), content, "UTF-8", false);

25

26 } catch (IOException e) {

27 e.printStackTrace();

28 }

29 }

View Code

9.上述第7步返回的结果示例:

拿 cnblogs(博客园)首页做测试,返回结果如下:

字段解释:

home_url :分析的页面地址

tag_name :当前页面的类型,多数情况下不正确,我只是拿home_url和页面的url比对,取了对应的text

list:页面中疑似列表元素

      list_sel:页面中疑似列表元素的选择器

      list_dom:疑似列表元素的各个一级孩子节点;键是该孩子节点的选择器,值是该孩子节点下所有叶子元素的选择器列表(仅在 is_subitem 为 true 时返回)

ifrs:页面中包含iframe分析的结果,没有则为空

  1 {

2 "home_url": "https://www.cnblogs.com/",

3 "tag_name": "1",

4 "list": [

5 {

6 "list_sel": "#post_list",

7 "list_dom": {

8 "#post_list > div.post_item:nth-child(7)": [

9 "#digg_count_9500831",

10 "#post_list > div.post_item:nth-child(7) > div.digg > div.clear",

11 "#digg_tip_9500831",

12 "#post_list > div.post_item:nth-child(7) > div.post_item_body > h3 > a.titlelnk",

13 "#post_list > div.post_item:nth-child(7) > div.post_item_body > p.post_item_summary",

14 "#post_list > div.post_item:nth-child(7) > div.post_item_body > div.post_item_foot > a.lightblue",

15 "#post_list > div.post_item:nth-child(7) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",

16 "#post_list > div.post_item:nth-child(7) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",

17 "#post_list > div.post_item:nth-child(7) > div.clear"

18 ],

19 "#post_list > div.post_item:nth-child(19)": [

20 "#digg_count_9499348",

21 "#post_list > div.post_item:nth-child(19) > div.digg > div.clear",

22 "#digg_tip_9499348",

23 "#post_list > div.post_item:nth-child(19) > div.post_item_body > h3 > a.titlelnk",

24 "#post_list > div.post_item:nth-child(19) > div.post_item_body > p.post_item_summary > a > img.pfs",

25 "#post_list > div.post_item:nth-child(19) > div.post_item_body > div.post_item_foot > a.lightblue",

26 "#post_list > div.post_item:nth-child(19) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",

27 "#post_list > div.post_item:nth-child(19) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",

28 "#post_list > div.post_item:nth-child(19) > div.clear"

29 ],

30 "#post_list > div.post_item:nth-child(6)": [

31 "#digg_count_9500833",

32 "#post_list > div.post_item:nth-child(6) > div.digg > div.clear",

33 "#digg_tip_9500833",

34 "#post_list > div.post_item:nth-child(6) > div.post_item_body > h3 > a.titlelnk",

35 "#post_list > div.post_item:nth-child(6) > div.post_item_body > p.post_item_summary",

36 "#post_list > div.post_item:nth-child(6) > div.post_item_body > div.post_item_foot > a.lightblue",

37 "#post_list > div.post_item:nth-child(6) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",

38 "#post_list > div.post_item:nth-child(6) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",

39 "#post_list > div.post_item:nth-child(6) > div.clear"

40 ],

41 "#post_list > div.post_item:nth-child(9)": [

42 "#digg_count_9500757",

43 "#post_list > div.post_item:nth-child(9) > div.digg > div.clear",

44 "#digg_tip_9500757",

45 "#post_list > div.post_item:nth-child(9) > div.post_item_body > h3 > a.titlelnk",

46 "#post_list > div.post_item:nth-child(9) > div.post_item_body > p.post_item_summary > a > img.pfs",

47 "#post_list > div.post_item:nth-child(9) > div.post_item_body > div.post_item_foot > a.lightblue",

48 "#post_list > div.post_item:nth-child(9) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",

49 "#post_list > div.post_item:nth-child(9) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",

50 "#post_list > div.post_item:nth-child(9) > div.clear"

51 ],

52 "#post_list > div.post_item:nth-child(17)": [

53 "#digg_count_9495616",

54 "#post_list > div.post_item:nth-child(17) > div.digg > div.clear",

55 "#digg_tip_9495616",

56 "#post_list > div.post_item:nth-child(17) > div.post_item_body > h3 > a.titlelnk",

57 "#post_list > div.post_item:nth-child(17) > div.post_item_body > p.post_item_summary > a > img.pfs",

58 "#post_list > div.post_item:nth-child(17) > div.post_item_body > div.post_item_foot > a.lightblue",

59 "#post_list > div.post_item:nth-child(17) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",

60 "#post_list > div.post_item:nth-child(17) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",

61 "#post_list > div.post_item:nth-child(17) > div.clear"

62 ],

63 "#post_list > div.post_item:nth-child(8)": [

64 "#digg_count_9500822",

65 "#post_list > div.post_item:nth-child(8) > div.digg > div.clear",

66 "#digg_tip_9500822",

67 "#post_list > div.post_item:nth-child(8) > div.post_item_body > h3 > a.titlelnk",

68 "#post_list > div.post_item:nth-child(8) > div.post_item_body > p.post_item_summary > a > img.pfs",

69 "#post_list > div.post_item:nth-child(8) > div.post_item_body > div.post_item_foot > a.lightblue",

70 "#post_list > div.post_item:nth-child(8) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",

71 "#post_list > div.post_item:nth-child(8) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",

72 "#post_list > div.post_item:nth-child(8) > div.clear"

73 ],

74 "#post_list > div.post_item:nth-child(18)": [

75 "#digg_count_9499454",

76 "#post_list > div.post_item:nth-child(18) > div.digg > div.clear",

77 "#digg_tip_9499454",

78 "#post_list > div.post_item:nth-child(18) > div.post_item_body > h3 > a.titlelnk",

79 "#post_list > div.post_item:nth-child(18) > div.post_item_body > p.post_item_summary > a > img.pfs",

80 "#post_list > div.post_item:nth-child(18) > div.post_item_body > div.post_item_foot > a.lightblue",

81 "#post_list > div.post_item:nth-child(18) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",

82 "#post_list > div.post_item:nth-child(18) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",

83 "#post_list > div.post_item:nth-child(18) > div.clear"

84 ],

85 "#post_list > div.post_item:nth-child(3)": [

86 "#digg_count_9500944",

87 "#post_list > div.post_item:nth-child(3) > div.digg > div.clear",

88 "#digg_tip_9500944",

89 "#post_list > div.post_item:nth-child(3) > div.post_item_body > h3 > a.titlelnk",

90 "#post_list > div.post_item:nth-child(3) > div.post_item_body > p.post_item_summary > a > img.pfs",

91 "#post_list > div.post_item:nth-child(3) > div.post_item_body > div.post_item_foot > a.lightblue",

92 "#post_list > div.post_item:nth-child(3) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",

93 "#post_list > div.post_item:nth-child(3) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",

94 "#post_list > div.post_item:nth-child(3) > div.clear"

95 ],

96 "#post_list > div.post_item:nth-child(2)": [

97 "#digg_count_9500357",

98 "#post_list > div.post_item:nth-child(2) > div.digg > div.clear",

99 "#digg_tip_9500357",

100 "#post_list > div.post_item:nth-child(2) > div.post_item_body > h3 > a.titlelnk",

101 "#post_list > div.post_item:nth-child(2) > div.post_item_body > p.post_item_summary",

102 "#post_list > div.post_item:nth-child(2) > div.post_item_body > div.post_item_foot > a.lightblue",

103 "#post_list > div.post_item:nth-child(2) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",

104 "#post_list > div.post_item:nth-child(2) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",

105 "#post_list > div.post_item:nth-child(2) > div.clear"

106 ],

107 "#post_list > div.post_item:nth-child(5)": [

108 "#digg_count_9500890",

109 "#post_list > div.post_item:nth-child(5) > div.digg > div.clear",

110 "#digg_tip_9500890",

111 "#post_list > div.post_item:nth-child(5) > div.post_item_body > h3 > a.titlelnk",

112 "#post_list > div.post_item:nth-child(5) > div.post_item_body > p.post_item_summary > a > img.pfs",

113 "#post_list > div.post_item:nth-child(5) > div.post_item_body > div.post_item_foot > a.lightblue",

114 "#post_list > div.post_item:nth-child(5) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",

115 "#post_list > div.post_item:nth-child(5) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",

116 "#post_list > div.post_item:nth-child(5) > div.clear"

117 ],

118 "#post_list > div.post_item:nth-child(4)": [

119 "#digg_count_9500935",

120 "#post_list > div.post_item:nth-child(4) > div.digg > div.clear",

121 "#digg_tip_9500935",

122 "#post_list > div.post_item:nth-child(4) > div.post_item_body > h3 > a.titlelnk",

123 "#post_list > div.post_item:nth-child(4) > div.post_item_body > p.post_item_summary > a > img.pfs",

124 "#post_list > div.post_item:nth-child(4) > div.post_item_body > div.post_item_foot > a.lightblue",

125 "#post_list > div.post_item:nth-child(4) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",

126 "#post_list > div.post_item:nth-child(4) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",

127 "#post_list > div.post_item:nth-child(4) > div.clear"

128 ],

129 "#post_list > div.post_item:nth-child(1)": [

130 "#digg_count_9501071",

131 "#post_list > div.post_item:nth-child(1) > div.digg > div.clear",

132 "#digg_tip_9501071",

133 "#post_list > div.post_item:nth-child(1) > div.post_item_body > h3 > a.titlelnk",

134 "#post_list > div.post_item:nth-child(1) > div.post_item_body > p.post_item_summary > a > img.pfs",

135 "#post_list > div.post_item:nth-child(1) > div.post_item_body > div.post_item_foot > a.lightblue",

136 "#post_list > div.post_item:nth-child(1) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",

137 "#post_list > div.post_item:nth-child(1) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",

138 "#post_list > div.post_item:nth-child(1) > div.clear"

139 ],

140 "#post_list > div.post_item:nth-child(15)": [

141 "#digg_count_9403762",

142 "#post_list > div.post_item:nth-child(15) > div.digg > div.clear",

143 "#digg_tip_9403762",

144 "#post_list > div.post_item:nth-child(15) > div.post_item_body > h3 > a.titlelnk",

145 "#post_list > div.post_item:nth-child(15) > div.post_item_body > p.post_item_summary > a > img.pfs",

146 "#post_list > div.post_item:nth-child(15) > div.post_item_body > div.post_item_foot > a.lightblue",

147 "#post_list > div.post_item:nth-child(15) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",

148 "#post_list > div.post_item:nth-child(15) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",

149 "#post_list > div.post_item:nth-child(15) > div.clear"

150 ],

151 "#post_list > div.post_item:nth-child(16)": [

152 "#digg_count_9499534",

153 "#post_list > div.post_item:nth-child(16) > div.digg > div.clear",

154 "#digg_tip_9499534",

155 "#post_list > div.post_item:nth-child(16) > div.post_item_body > h3 > a.titlelnk",

156 "#post_list > div.post_item:nth-child(16) > div.post_item_body > p.post_item_summary > a > img.pfs",

157 "#post_list > div.post_item:nth-child(16) > div.post_item_body > div.post_item_foot > a.lightblue",

158 "#post_list > div.post_item:nth-child(16) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",

159 "#post_list > div.post_item:nth-child(16) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",

160 "#post_list > div.post_item:nth-child(16) > div.clear"

161 ],

162 "#post_list > div.post_item:nth-child(13)": [

163 "#digg_count_9465698",

164 "#post_list > div.post_item:nth-child(13) > div.digg > div.clear",

165 "#digg_tip_9465698",

166 "#post_list > div.post_item:nth-child(13) > div.post_item_body > h3 > a.titlelnk",

167 "#post_list > div.post_item:nth-child(13) > div.post_item_body > p.post_item_summary > a > img.pfs",

168 "#post_list > div.post_item:nth-child(13) > div.post_item_body > div.post_item_foot > a.lightblue",

169 "#post_list > div.post_item:nth-child(13) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",

170 "#post_list > div.post_item:nth-child(13) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",

171 "#post_list > div.post_item:nth-child(13) > div.clear"

172 ],

173 "#post_list > div.post_item:nth-child(14)": [

174 "#digg_count_9498410",

175 "#post_list > div.post_item:nth-child(14) > div.digg > div.clear",

176 "#digg_tip_9498410",

177 "#post_list > div.post_item:nth-child(14) > div.post_item_body > h3 > a.titlelnk",

178 "#post_list > div.post_item:nth-child(14) > div.post_item_body > p.post_item_summary > a > img.pfs",

179 "#post_list > div.post_item:nth-child(14) > div.post_item_body > div.post_item_foot > a.lightblue",

180 "#post_list > div.post_item:nth-child(14) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",

181 "#post_list > div.post_item:nth-child(14) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",

182 "#post_list > div.post_item:nth-child(14) > div.clear"

183 ],

184 "#post_list > div.post_item:nth-child(11)": [

185 "#digg_count_9500633",

186 "#post_list > div.post_item:nth-child(11) > div.digg > div.clear",

187 "#digg_tip_9500633",

188 "#post_list > div.post_item:nth-child(11) > div.post_item_body > h3 > a.titlelnk",

189 "#post_list > div.post_item:nth-child(11) > div.post_item_body > p.post_item_summary",

190 "#post_list > div.post_item:nth-child(11) > div.post_item_body > div.post_item_foot > a.lightblue",

191 "#post_list > div.post_item:nth-child(11) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",

192 "#post_list > div.post_item:nth-child(11) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",

193 "#post_list > div.post_item:nth-child(11) > div.clear"

194 ],

195 "#post_list > div.post_item:nth-child(12)": [

196 "#digg_count_9500352",

197 "#post_list > div.post_item:nth-child(12) > div.digg > div.clear",

198 "#digg_tip_9500352",

199 "#post_list > div.post_item:nth-child(12) > div.post_item_body > h3 > a.titlelnk",

200 "#post_list > div.post_item:nth-child(12) > div.post_item_body > p.post_item_summary > a > img.pfs",

201 "#post_list > div.post_item:nth-child(12) > div.post_item_body > div.post_item_foot > a.lightblue",

202 "#post_list > div.post_item:nth-child(12) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",

203 "#post_list > div.post_item:nth-child(12) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",

204 "#post_list > div.post_item:nth-child(12) > div.clear"

205 ],

206 "#post_list > div.post_item:nth-child(20)": [

207 "#digg_count_9499225",

208 "#post_list > div.post_item:nth-child(20) > div.digg > div.clear",

209 "#digg_tip_9499225",

210 "#post_list > div.post_item:nth-child(20) > div.post_item_body > h3 > a.titlelnk",

211 "#post_list > div.post_item:nth-child(20) > div.post_item_body > p.post_item_summary > a > img.pfs",

212 "#post_list > div.post_item:nth-child(20) > div.post_item_body > div.post_item_foot > a.lightblue",

213 "#post_list > div.post_item:nth-child(20) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",

214 "#post_list > div.post_item:nth-child(20) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",

215 "#post_list > div.post_item:nth-child(20) > div.clear"

216 ],

217 "#post_list > div.post_item:nth-child(10)": [

218 "#digg_count_9500632",

219 "#post_list > div.post_item:nth-child(10) > div.digg > div.clear",

220 "#digg_tip_9500632",

221 "#post_list > div.post_item:nth-child(10) > div.post_item_body > h3 > a.titlelnk",

222 "#post_list > div.post_item:nth-child(10) > div.post_item_body > p.post_item_summary > a > img.pfs",

223 "#post_list > div.post_item:nth-child(10) > div.post_item_body > div.post_item_foot > a.lightblue",

224 "#post_list > div.post_item:nth-child(10) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",

225 "#post_list > div.post_item:nth-child(10) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",

226 "#post_list > div.post_item:nth-child(10) > div.clear"

227 ]

228 },

229 "scroce": "9860"

230 },

231 {

232 "list_sel": "#cate_item",

233 "list_dom": {

234 "#cate_item_108705": [

235 "#cate_item_108705 > a"

236 ],

237 "#cate_item_108704": [

238 "#cate_item_108704 > a"

239 ],

240 "#cate_item_108703": [

241 "#cate_item_108703 > a"

242 ],

243 "#cate_item_4": [

244 "#cate_item_4 > a"

245 ],

246 "#cate_item_2": [

247 "#cate_item_2 > a"

248 ],

249 "#cate_item_108709": [

250 "#cate_item_108709 > a"

251 ],

252 "#cate_item_0": [

253 "#cate_item_0 > a"

254 ],

255 "#cate_item_108698": [

256 "#cate_item_108698 > a"

257 ],

258 "#cate_item_108724": [

259 "#cate_item_108724 > a"

260 ],

261 "#cate_item_108701": [

262 "#cate_item_108701 > a"

263 ],

264 "#cate_item_108712": [

265 "#cate_item_108712 > a"

266 ],

267 "#cate_item_-1": [

268 "#cate_item_-1 > a"

269 ]

270 },

271 "scroce": "1248"

272 },

273 {

274 "list_sel": "#friend_link",

275 "list_dom": {

276 "#friend_link > a:nth-child(15)": [

277 "#friend_link > a:nth-child(15)"

278 ],

279 "#friend_link > a:nth-child(16)": [

280 "#friend_link > a:nth-child(16)"

281 ],

282 "#friend_link > a:nth-child(17)": [

283 "#friend_link > a:nth-child(17)"

284 ],

285 "#friend_link > a:nth-child(18)": [

286 "#friend_link > a:nth-child(18)"

287 ],

288 "#friend_link > a:nth-child(1)": [

289 "#friend_link > a:nth-child(1)"

290 ],

291 "#friend_link > a:nth-child(11)": [

292 "#friend_link > a:nth-child(11)"

293 ],

294 "#friend_link > a:nth-child(12)": [

295 "#friend_link > a:nth-child(12)"

296 ],

297 "#friend_link > a:nth-child(3)": [

298 "#friend_link > a:nth-child(3)"

299 ],

300 "#friend_link > a:nth-child(13)": [

301 "#friend_link > a:nth-child(13)"

302 ],

303 "#friend_link > a:nth-child(2)": [

304 "#friend_link > a:nth-child(2)"

305 ],

306 "#friend_link > a:nth-child(14)": [

307 "#friend_link > a:nth-child(14)"

308 ],

309 "#friend_link > a:nth-child(19)": [

310 "#friend_link > a:nth-child(19)"

311 ],

312 "#friend_link > a:nth-child(5)": [

313 "#friend_link > a:nth-child(5)"

314 ],

315 "#friend_link > a:nth-child(4)": [

316 "#friend_link > a:nth-child(4)"

317 ],

318 "#friend_link > a:nth-child(7)": [

319 "#friend_link > a:nth-child(7)"

320 ],

321 "#friend_link > a:nth-child(6)": [

322 "#friend_link > a:nth-child(6)"

323 ],

324 "#friend_link > a:nth-child(10)": [

325 "#friend_link > a:nth-child(10)"

326 ],

327 "#friend_link > a:nth-child(9)": [

328 "#friend_link > a:nth-child(9)"

329 ],

330 "#friend_link > a:nth-child(8)": [

331 "#friend_link > a:nth-child(8)"

332 ]

333 },

334 "scroce": "1197"

335 },

336 {

337 "list_sel": "#side_nav",

338 "list_dom": {

339 "#side_nav > div.w_l:nth-child(16)": [

340 "#side_nav > div.w_l:nth-child(16) > h4",

341 "#site_stats"

342 ],

343 "#side_nav > p.r_l_1:nth-child(7)": [

344 "#side_nav > p.r_l_1:nth-child(7)"

345 ],

346 "#side_nav > p.r_l_2:nth-child(8)": [

347 "#side_nav > p.r_l_2:nth-child(8)"

348 ],

349 "#side_nav > p.r_l_3:nth-child(9)": [

350 "#side_nav > p.r_l_3:nth-child(9)"

351 ],

352 "#side_nav > p.r_l_1:nth-child(5)": [

353 "#side_nav > p.r_l_1:nth-child(5)"

354 ],

355 "#side_nav > p.r_l_3:nth-child(13)": [

356 "#side_nav > p.r_l_3:nth-child(13)"

357 ],

358 "#side_nav > p.r_l_2:nth-child(4)": [

359 "#side_nav > p.r_l_2:nth-child(4)"

360 ],

361 "#side_nav > p.r_l_3:nth-child(19)": [

362 "#side_nav > p.r_l_3:nth-child(19)"

363 ],

364 "#side_nav > p.r_l_3:nth-child(3)": [

365 "#side_nav > p.r_l_3:nth-child(3)"

366 ],

367 "#side_nav > div.w_l:nth-child(6)": [

368 "#side_nav > div.w_l:nth-child(6) > h4",

369 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(1) > a",

370 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(2) > a",

371 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(3) > a",

372 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(4) > a",

373 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(5) > a",

374 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(6) > a"

375 ],

376 "#side_nav > p.r_l_2:nth-child(18)": [

377 "#side_nav > p.r_l_2:nth-child(18)"

378 ],

379 "#side_nav > div.l_s:nth-child(12)": [

380 "#side_nav > div.l_s:nth-child(12)"

381 ],

382 "#cate_title_block": [

383 "#cate_title_title > div.cate_title",

384 "#cate_item_108698 > a",

385 "#cate_item_2 > a",

386 "#cate_item_108701 > a",

387 "#cate_item_108703 > a",

388 "#cate_item_108704 > a",

389 "#cate_item_108705 > a",

390 "#cate_item_108709 > a",

391 "#cate_item_108712 > a",

392 "#cate_item_108724 > a",

393 "#cate_item_4 > a",

394 "#cate_item_0 > a",

395 "#cate_item_-1 > a",

396 "#cate_title_block > div.cate_bottom",

397 "#cate_sub_block",

398 "#cate_title_block > script"

399 ],

400 "#side_nav > div.l_s:nth-child(2)": [

401 "#side_nav > div.l_s:nth-child(2)"

402 ],

403 "#side_nav > p.r_l_1:nth-child(17)": [

404 "#side_nav > p.r_l_1:nth-child(17)"

405 ],

406 "#side_nav > p.r_l_2:nth-child(14)": [

407 "#side_nav > p.r_l_2:nth-child(14)"

408 ],

409 "#side_nav > p.r_l_1:nth-child(15)": [

410 "#side_nav > p.r_l_1:nth-child(15)"

411 ],

412 "#user_stats": [

413 "#user_stats"

414 ],

415 "#side_nav > div.l_s:nth-child(10)": [

416 "#side_nav > div.l_s:nth-child(10)"

417 ]

418 },

419 "scroce": "975"

420 },

421 {

422 "list_sel": "#paging_block > div.pager",

423 "list_dom": {

424 "#paging_block > div.pager > a.p_9.middle": [

425 "#paging_block > div.pager > a.p_9.middle"

426 ],

427 "#paging_block > div.pager > a.p_7.middle": [

428 "#paging_block > div.pager > a.p_7.middle"

429 ],

430 "#paging_block > div.pager > a.p_8.middle": [

431 "#paging_block > div.pager > a.p_8.middle"

432 ],

433 "#paging_block > div.pager > a:nth-child(14)": [

434 "#paging_block > div.pager > a:nth-child(14)"

435 ],

436 "#paging_block > div.pager > a.p_11.middle": [

437 "#paging_block > div.pager > a.p_11.middle"

438 ],

439 "#paging_block > div.pager > a.p_3.middle": [

440 "#paging_block > div.pager > a.p_3.middle"

441 ],

442 "#paging_block > div.pager > a.p_4.middle": [

443 "#paging_block > div.pager > a.p_4.middle"

444 ],

445 "#paging_block > div.pager > a.p_10.middle": [

446 "#paging_block > div.pager > a.p_10.middle"

447 ],

448 "#paging_block > div.pager > a.p_2.middle": [

449 "#paging_block > div.pager > a.p_2.middle"

450 ],

451 "#paging_block > div.pager > a.p_5.middle": [

452 "#paging_block > div.pager > a.p_5.middle"

453 ],

454 "#paging_block > div.pager > a.p_6.middle": [

455 "#paging_block > div.pager > a.p_6.middle"

456 ],

457 "#paging_block > div.pager > a.p_1.current": [

458 "#paging_block > div.pager > a.p_1.current"

459 ],

460 "#paging_block > div.pager > span.ellipsis": [

461 "#paging_block > div.pager > span.ellipsis"

462 ],

463 "#paging_block > div.pager > a.p_200.last": [

464 "#paging_block > div.pager > a.p_200.last"

465 ]

466 },

467 "scroce": "865"

468 },

469 {

470 "list_sel": "#main > div.post_nav_block_wrapper > ul.post_nav_block",

471 "list_dom": {

472 "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(1)": [

473 "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(1) > a.current_nav"

474 ],

475 "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(3)": [

476 "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(3) > a"

477 ],

478 "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(2)": [

479 "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(2) > a"

480 ],

481 "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(5)": [

482 "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(5) > a"

483 ],

484 "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(4)": [

485 "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(4) > a"

486 ],

487 "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(7)": [

488 "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(7) > a"

489 ],

490 "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(6)": [

491 "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(6) > a"

492 ]

493 },

494 "scroce": "590"

495 },

496 {

497 "list_sel": "#nav_menu",

498 "list_dom": {

499 "#nav_menu > a:nth-child(3)": [

500 "#nav_menu > a:nth-child(3)"

501 ],

502 "#nav_menu > a:nth-child(2)": [

503 "#nav_menu > a:nth-child(2)"

504 ],

505 "#nav_menu > a:nth-child(5)": [

506 "#nav_menu > a:nth-child(5)"

507 ],

508 "#nav_menu > a:nth-child(4)": [

509 "#nav_menu > a:nth-child(4)"

510 ],

511 "#nav_menu > a:nth-child(1)": [

512 "#nav_menu > a:nth-child(1)"

513 ],

514 "#nav_menu > a:nth-child(7)": [

515 "#nav_menu > a:nth-child(7)"

516 ],

517 "#nav_menu > a:nth-child(6)": [

518 "#nav_menu > a:nth-child(6)"

519 ],

520 "#nav_menu > a:nth-child(9)": [

521 "#nav_menu > a:nth-child(9)"

522 ],

523 "#nav_menu > a:nth-child(8)": [

524 "#nav_menu > a:nth-child(8)"

525 ]

526 },

527 "scroce": "486"

528 },

529 {

530 "list_sel": "#side_nav > div.w_l:nth-child(6) > ul",

531 "list_dom": {

532 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(3)": [

533 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(3) > a"

534 ],

535 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(2)": [

536 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(2) > a"

537 ],

538 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(1)": [

539 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(1) > a"

540 ],

541 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(6)": [

542 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(6) > a"

543 ],

544 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(5)": [

545 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(5) > a"

546 ],

547 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(4)": [

548 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(4) > a"

549 ]

550 },

551 "scroce": "486"

552 },

553 {

554 "list_sel": "#headline_block > ul",

555 "list_dom": {

556 "#headline_block > ul > li:nth-child(4)": [

557 "#headline_block > ul > li:nth-child(4) > a:nth-child(1)",

558 "#headline_block > ul > li:nth-child(4) > a.right_more"

559 ],

560 "#headline_block > ul > li.editor_pick": [

561 "#editor_pick_count",

562 "#headline_block > ul > li.editor_pick > a.right_more"

563 ],

564 "#headline_block > ul > li:nth-child(3)": [

565 "#headline_block > ul > li:nth-child(3) > a:nth-child(1)",

566 "#headline_block > ul > li:nth-child(3) > a.right_more"

567 ],

568 "#headline_block > ul > li:nth-child(2)": [

569 "#headline_block > ul > li:nth-child(2) > a:nth-child(1)",

570 "#headline_block > ul > li:nth-child(2) > a.right_more"

571 ]

572 },

573 "scroce": "407"

574 },

575 {

576 "list_sel": "#header",

577 "list_dom": {

578 "#header > p.h_r_3:nth-child(1)": [

579 "#header > p.h_r_3:nth-child(1)"

580 ],

581 "#header > p.h_r_2:nth-child(6)": [

582 "#header > p.h_r_2:nth-child(6)"

583 ],

584 "#header > p.h_r_1:nth-child(3)": [

585 "#header > p.h_r_1:nth-child(3)"

586 ],

587 "#header > p.h_r_2:nth-child(2)": [

588 "#header > p.h_r_2:nth-child(2)"

589 ],

590 "#header > p.h_r_1:nth-child(5)": [

591 "#header > p.h_r_1:nth-child(5)"

592 ],

593 "#header > p.h_r_3:nth-child(7)": [

594 "#header > p.h_r_3:nth-child(7)"

595 ],

596 "#header_block": [

597 "#logo > h1 > a > img",

598 "#header_block > div.clear"

599 ]

600 },

601 "scroce": "335"

602 }

603 ],

604 "ifrs": []

605 }

View Code

10.上述第8步标记文件效果:

   红色虚线框起来的部分,就是返回的json结果中list里各个list_sel选择器选中的元素。

分析结果统计:

处理了将近1万个网站后发现,大致的网页列表结构都可以被发现,平均耗时大致在2-3s左右(因为用的是jsoup访问网页,这个时间包含了网页响应的时间),时间复杂度还有待优化。

对于一些结构比较复杂、杂乱的网页,分析结果的支持还有待加强;代码写得比较乱,有待优化,应该会有更好的处理方式,还请指教,相互学习交流。

转载请注明出处:https://www.cnblogs.com/jstarseven/p/9501210.html

源码地址:https://github.com/jstarseven/list-autofire

  


 -END-

以上是 java实现网页结构分析列表发现 的全部内容, 来源链接: utcz.com/z/391781.html

回到顶部