java实现网页结构分析列表发现
原文出处:http://www.yund.tech/zdetail.html?type=1&id=ee06002e2b83e7677c30aedc52d3429e
作者:jstarseven
现在的网站千奇百怪,页面格式五花八门。需要提取网页中的列表数据时,逐个页面分析处理很头疼。本文给出一个页面结构分析程序,可以自动分析出页面中大致的列表结构。
废话不多说,我也不会说,show me code,code is terrible,so what hahaha。-------jstarseven
1.抽取元素dom结构框架
1 /**2 * 分析元素dom结构框架
3 *
4 * @param node
5 * @return
6 */
7 public String filterHtml(Element node) {
8 //去除节点的属性值
9 Document new_node = Jsoup.parse(node.outerHtml());
10 Elements elements = new_node.getAllElements();
11 for (Element item : elements) {
12 Attributes attributes = item.attributes();
13 for (Attribute a : attributes) {
14 if (a.getKey().equals(KeysEnum.attr_scroce)) {
15 item.removeAttr(a.getKey());
16 continue;
17 }
18 a.setValue(StringUtils.EMPTY);
19 }
20 }
21 //去除注释节点,节点文本内容
22 String str_new = new_node.outerHtml().replaceAll("<!--?(.*?)-->", "");
23 str_new = str_new.replaceAll("\\s*", "");
24 str_new = str_new.replaceAll(">(.*?)<", "><");
25 return str_new;
26 }
2.采用动态规划处理两个字符串相似度
/**
 * String similarity helper based on the Levenshtein edit distance.
 */
public class SimilarDegree {

    /** Threshold (0.8) that callers compare a similarity ratio against. */
    public static final double degree = 0.8;

    /**
     * Computes the Levenshtein edit distance between two strings, i.e. the
     * minimum number of single-character insertions, deletions and
     * substitutions needed to turn {@code source} into {@code target}.
     *
     * <p>Uses dynamic programming with two rolling rows, so memory is
     * proportional to the length of {@code target} rather than to the product
     * of both lengths.
     *
     * @param source first string; {@code null} is treated as the empty string
     * @param target second string; {@code null} is treated as the empty string
     * @return the edit distance, {@code 0} when both strings are equal
     */
    public static int EditDistance(String source, String target) {
        String from = source == null ? "" : source;
        String to = target == null ? "" : target;
        int sourceLen = from.length();
        int targetLen = to.length();

        // previous[j] = distance between the first (i - 1) chars of source and the first j chars of target
        int[] previous = new int[targetLen + 1];
        int[] current = new int[targetLen + 1];
        for (int j = 0; j <= targetLen; j++) {
            previous[j] = j;
        }

        for (int i = 1; i <= sourceLen; i++) {
            current[0] = i;
            char sourceChar = from.charAt(i - 1);
            for (int j = 1; j <= targetLen; j++) {
                if (sourceChar == to.charAt(j - 1)) {
                    current[j] = previous[j - 1];
                } else {
                    int insert = current[j - 1];
                    int delete = previous[j];
                    int replace = previous[j - 1];
                    current[j] = 1 + Math.min(replace, Math.min(insert, delete));
                }
            }
            int[] swap = previous;
            previous = current;
            current = swap;
        }
        return previous[targetLen];
    }

    public static void main(String[] args) {
        System.out.println(EditDistance("html > body > ul > li.proiect_item:nth-child(1) > div.item_row.item_row_title > div:nth-child(1) > a",
                "html > body > ul > li.proiect_item:nth-child(2) > div.item_row.item_row_title > div:nth-child(1) > a"));
    }

}
View Code
3.对网页中每个节点的一级孩子节点分类
1 /**2 * 统计列表下各个一级节点类型及个数
3 *
4 * @param node
5 * @return
6 */
7 private Map<String, Integer> getGroupNode(Element node) {
8 Map<String, Integer> map = new HashMap<String, Integer>();
9 Elements children = node.children();
10 for (Element item : children) {
11 if (KeysEnum.input.equalsIgnoreCase(item.tagName()) || KeysEnum.br.equalsIgnoreCase(item.tagName())
12 || KeysEnum.script.equalsIgnoreCase(item.tagName()) || KeysEnum.link.equalsIgnoreCase(item.tagName())
13 || KeysEnum.style.equalsIgnoreCase(item.tagName()) || KeysEnum.meta.equalsIgnoreCase(item.tagName())
14 || KeysEnum.select.equalsIgnoreCase(item.tagName()) || KeysEnum.option.equalsIgnoreCase(item.tagName())
15 || KeysEnum.video.equals(item.tagName()) || KeysEnum.audio.equals(item.tagName())
16 || KeysEnum.textarea.equals(item.tagName())) continue;
17 String key = filterHtml(item);
18 if (map.containsKey(key)) {
19 map.put(key, (Integer) map.get(key) + 1);
20 } else {
21 boolean is_like = false;
22 for (String map_key : map.keySet()) {
23 int dis = SimilarDegree.EditDistance(key, (String) map_key);
24 float v = (float) (key.length() - dis) / key.length();
25 if (v > SimilarDegree.degree) {
26 map.put(map_key, (Integer) map.get(map_key) + 1);
27 is_like = true;
28 break;
29 }
30 }
31 if (!is_like) map.put(key, 1);
32 }
33 }
34 return map;
35 }
4.处理网页中每个元素的叶子节点
1 /**2 * 获取叶子节点选择器
3 *
4 * @param node
5 */
6 public static List<String> getYeziNodeSel(Element node) {
7 List<String> list = new ArrayList<String>();
8 Elements all = node.getAllElements();
9 for (Element item : all) {
10 if (item.children().isEmpty()) list.add(item.cssSelector());
11 }
12 return list;
13 }
5.时间提取工具类
1 /**2 * jstarseven
3 * 通用时间处理类 return Date
4 * */
5 public class DateParser {
6 private static int timezone = 0;
7 private static final Pattern[] DPTN = {
8
9 Pattern.compile(
10 "(\\d{1,2})[\\s\\-\\/](\\d{1,2})[\\s\\-\\/](20\\d{2})\\s{0,2}((\\d{1,2})[:\\s](\\d{1,2})[:\\s]?(\\d{1,2})?)?"),
11
12 Pattern.compile(
13 "((20)?\\d{2}) {0,2}[\\.\\-/年] {0,2}(\\d{1,2}) {0,2}[\\.\\-/月] {0,2}(\\d{1,2}) {0,2}[日 \\s]{0,2}((上午)|(下午))?\\s{0,2}((\\d{1,2})[:\\s时](\\d{1,2})[:\\s分]?(\\d{1,2})?)?"),
14
15 Pattern.compile("((20)?\\d{2})/(\\d{2})(\\d{2})"),
16
17 Pattern.compile(
18 "(\\d{1,2})[\\.\\-\\s/月](\\d{1,2})[日\\s]{0,2}((上午)|(下午))?\\s{0,2}((\\d{1,2})[:\\s](\\d{1,2})[:\\s]?(\\d{1,2})?)?"),
19
20 Pattern.compile("([今前昨]天)?\\s{0,4}(\\d{1,2})[:\\s]{1,3}(\\d{1,2})[:\\s]?(\\d{1,2})?"),
21
22 Pattern.compile("[今前昨]天"),
23
24 Pattern.compile("((\\d{1,2})|(半))\\s*个?([天秒小时分钟周月年]{1,2})前"),
25
26 Pattern.compile("(\\d{1,2})小?时(\\d{1,2})分钟?前"),
27
28 Pattern.compile("(20\\d{2})[01]?(\\d{2})[012]?(\\d{2})") };
29
30 public static Date parse(Object obj) {
31 if (obj == null) {
32 return null;
33 }
34 if ((obj instanceof Date)) {
35 return (Date) obj;
36 }
37 if ((obj instanceof Number)) {
38 return new Date(((Number) obj).longValue());
39 }
40 String str = ((String) obj).trim();
41 if ((str.length() == 0) || ("null".equalsIgnoreCase(str))) {
42 return null;
43 }
44 str = transZH(str);
45 Calendar c = Calendar.getInstance();
46 c.setTimeInMillis(System.currentTimeMillis());
47
48 Matcher mt = DPTN[0].matcher(str);
49 if (mt.find()) {
50 int date = Integer.parseInt(mt.group(2));
51 if ((date == 0) || (date > 31)) {
52 return null;
53 }
54 int month = Integer.parseInt(mt.group(1));
55 if (month <= 0) {
56 return null;
57 }
58 if (month > 12) {
59 if ((date > 0) && (date <= 12) && (month < 32)) {
60 int tmp = month;
61 month = date;
62 date = tmp;
63 } else {
64 return null;
65 }
66 }
67 String sy = mt.group(3);
68 int year = Integer.parseInt(sy);
69 if ((year < 2000) || (year > 2099)) {
70 return null;
71 }
72 String hms = mt.group(4);
73 if ((hms == null) || (hms.length() == 0)) {
74 c.set(year, month - 1, date, timezone > 0 ? timezone : 0, 0, 0);
75 return c.getTime();
76 }
77 int hour = Integer.parseInt(mt.group(5));
78 if (hour >= 24) {
79 return null;
80 }
81 int min = Integer.parseInt(mt.group(6));
82 if (min >= 60) {
83 return null;
84 }
85 String ssec = mt.group(7);
86 int sec = (ssec == null) || (ssec.length() == 0) ? 0 : Integer.parseInt(ssec);
87 c.set(year, month - 1, date, hour, min, sec);
88 return c.getTime();
89 }
90 mt = DPTN[1].matcher(str);
91 if (mt.find()) {
92 String sy = mt.group(1);
93 if (sy.length() == 2) {
94 sy = "20" + sy;
95 }
96 int year = Integer.parseInt(sy);
97 if ((year < 2000) || (year > 2099)) {
98 return null;
99 }
100 int month = Integer.parseInt(mt.group(3)) - 1;
101 if ((month < 0) || (month > 11)) {
102 return null;
103 }
104 int date = Integer.parseInt(mt.group(4));
105 if (date > 31) {
106 return null;
107 }
108 String ss = mt.group(8);
109 if ((ss == null) || (ss.length() == 0)) {
110 c.set(year, month, date, timezone > 0 ? timezone : 0, 0, 0);
111 return c.getTime();
112 }
113 int hour = Integer.parseInt(mt.group(9));
114 if (hour >= 24) {
115 return null;
116 }
117 int min = Integer.parseInt(mt.group(10));
118 if (min >= 60) {
119 return null;
120 }
121 String ssec = mt.group(11);
122 int sec = (ssec == null) || (ssec.length() == 0) ? 0 : Integer.parseInt(ssec);
123 if (("下午".equals(mt.group(5))) && (hour < 12)) {
124 hour += 12;
125 }
126 c.set(year, month, date, hour, min, sec);
127 return c.getTime();
128 }
129 mt = DPTN[2].matcher(str);
130 if (mt.find()) {
131 String strYear = mt.group(1);
132 if (!strYear.startsWith("20")) {
133 strYear = "20" + strYear;
134 }
135 int year = Integer.parseInt(strYear);
136 int month = Integer.parseInt(mt.group(3)) - 1;
137 int day = Integer.parseInt(mt.group(4));
138 c.set(year, month, day, 0, 0, 0);
139 return c.getTime();
140 }
141 mt = DPTN[3].matcher(str);
142 if (mt.find()) {
143 int year = c.get(1);
144 int month = Integer.parseInt(mt.group(1)) - 1;
145 if (month < 0) {
146 return null;
147 }
148 if (month > c.get(2)) {
149 year--;
150 }
151 int date = Integer.parseInt(mt.group(2));
152 if (date > 31) {
153 return null;
154 }
155 String p = mt.group(6);
156 if ((p == null) || (p.length() == 0)) {
157 c.set(year, month, date, timezone > 0 ? timezone : 0, 0, 0);
158 return c.getTime();
159 }
160 int hour = Integer.parseInt(mt.group(7));
161 if (hour >= 24) {
162 return null;
163 }
164 int min = Integer.parseInt(mt.group(8));
165 if (min >= 60) {
166 return null;
167 }
168 String ssec = mt.group(9);
169 int sec = (ssec == null) || (ssec.length() == 0) ? 0 : Integer.parseInt(ssec);
170 if (("下午".equals(mt.group(3))) && (hour < 12)) {
171 hour += 12;
172 }
173 c.set(year, month, date, hour, min, sec);
174 return c.getTime();
175 }
176 mt = DPTN[4].matcher(str);
177 if (mt.find()) {
178 int hour = Integer.parseInt(mt.group(2));
179 if (hour >= 24) {
180 return null;
181 }
182 int min = Integer.parseInt(mt.group(3));
183 if (min >= 60) {
184 return null;
185 }
186 String day = mt.group(1);
187 if ("昨天".equals(day)) {
188 c.add(5, -1);
189 } else if ("前天".equals(day)) {
190 c.add(5, -2);
191 }
192 c.set(11, hour);
193 c.set(12, min);
194 return c.getTime();
195 }
196 mt = DPTN[5].matcher(str);
197 if (mt.find()) {
198 String day = mt.group(0);
199 if ("昨天".equals(day)) {
200 c.add(5, -1);
201 } else if ("前天".equals(day)) {
202 c.add(5, -2);
203 }
204 return c.getTime();
205 }
206 mt = DPTN[6].matcher(str);
207 if (mt.find()) {
208 String s = mt.group(4);
209 long t;
210 if ("年".equals(s)) {
211 t = 31536000000L;
212 } else {
213 if ("月".equals(s)) {
214 t = 2592000000L;
215 } else {
216 if ("周".equals(s)) {
217 t = 604800000L;
218 } else {
219 if ("天".equals(s)) {
220 t = 86400000L;
221 } else {
222 if ("小时".equals(s)) {
223 t = 3600000L;
224 } else {
225 if ("时".equals(s)) {
226 t = 3600000L;
227 } else {
228 if ("分钟".equals(s)) {
229 t = 60000L;
230 } else {
231 if ("分".equals(s)) {
232 t = 60000L;
233 } else {
234 if ("秒".equals(s)) {
235 t = 1000L;
236 } else {
237 return null;
238 }
239 }
240 }
241 }
242 }
243 }
244 }
245 }
246 }
247 String vs = mt.group(1);
248 if ("半".equals(vs)) {
249 t = System.currentTimeMillis() - t / 2L;
250 } else {
251 t = System.currentTimeMillis() - Integer.parseInt(vs) * t;
252 }
253 return new Date(t);
254 }
255 mt = DPTN[7].matcher(str);
256 if (mt.find()) {
257 int hh = Integer.parseInt(mt.group(1));
258 int nn = Integer.parseInt(mt.group(2));
259 long t = 3600000 * hh + 60000 * nn;
260 return new Date(System.currentTimeMillis() - t);
261 }
262 mt = DPTN[8].matcher(str);
263 if (mt.find()) {
264 String sy = mt.group(1);
265 int year = Integer.parseInt(sy);
266 if ((year < 2000) || (year > 2099)) {
267 return null;
268 }
269 int month = Integer.parseInt(mt.group(2)) - 1;
270 if ((month < 0) || (month > 11)) {
271 return null;
272 }
273 int date = Integer.parseInt(mt.group(3));
274 if (date > 31) {
275 return null;
276 }
277 c.set(year, month, date, timezone > 0 ? timezone : 0, 0, 0);
278 return c.getTime();
279 }
280 return null;
281 }
282
283 private static String transZH(String string) {
284 String zh = "〇一二三四五六七八九";
285 string = string.replace("整", "0分").replaceAll("[上下]午", "");
286 StringBuffer buffer = new StringBuffer();
287 for (Character Char : string.toCharArray()) {
288 int index = zh.indexOf(Char);
289 if (index >= 0) {
290 buffer.append(index);
291 } else {
292 buffer.append(Char);
293 }
294 }
295 String str = buffer.toString();
296 int index = str.indexOf("十");
297 if (index == -1) {
298 return str;
299 } else {
300 if (!Character.isDigit(str.charAt(index-1)) && !Character.isDigit(str.charAt(index+1))) {
301 str=str.replace("十", "10");
302 }else if (Character.isDigit(str.charAt(index-1)) && !Character.isDigit(str.charAt(index+1))) {
303 str=str.replace("十", "0");
304 }else if(!Character.isDigit(str.charAt(index-1)) && Character.isDigit(str.charAt(index+1))){
305 str=str.replace("十", "1");
306 }else if(Character.isDigit(str.charAt(index-1)) && Character.isDigit(str.charAt(index+1))){
307 str=str.replace("十", "");
308 }
309 return str;
310 }
311
312 }
313
314 public static void main(String[] args) {
315 System.out.println(parse("1982-01-01 00:00:00"));
316 System.out.println(transZH("二〇一七年九月十日 上午十时整"));
317 System.out.println(transZH("二〇一七年九月二十日 上午九时整"));
318 System.out.println(transZH("二〇一七年九月十九日 上午九时整"));
319 System.out.println(transZH("二〇一七年九月二十三日 上午九时整"));
320 System.out.println("timezone=" + timezone);
321 String[] testdata = { "1982-01-01 00:00:00","11-13 15:24", "2009-8-30 16:42:10", "8-23 15:24", "2周前", "3 天前", "12 分钟前", "3天前",
322 "前天 09:36", "昨天 09:21 ", "2010-12-17 00:23 ", "2010-12-17 ", "昨天 12:37 ", "2011-8-15 08:42",
323 "25-7-2011 11:43:57", "1-9-2011", "06-03", "半小时前", "今天发表", "昨天发表", "前天发表", "06-03-2010",
324 "02-01-2010 00:39", "3小时26分钟前", "2010-8-24 上午 01:17:32", "2010-8-24 下午 01:17:32", "7小时前 »",
325 "4/29/2010 1:31:00", "2012 年 1 月 31 日", "17时20分前", "2017年10月12日 14时30分", "二〇一七年九月十九日 上午九时整" };
326
327 DateFormat df = DateFormat.getDateTimeInstance(2, 2);
328 for (String s : testdata) {
329 Date d = parse(s);
330 System.out.println(s + "\t\t" + (d == null ? d : df.format(d)));
331 }
332 }
333
334 }
View Code
6.自定义比较器对网页所有元素排序,发现结果靠前的基本都是列表元素
比较器:按照疑似列表的可能性
1 /**2 * 排序子节点
3 * 1.最大相同dom结构长度
4 * 2.最大相同dom结构元素数量
5 *
6 * @param nodes
7 * @return
8 */
9 private Elements sortBy(Elements nodes, String base_url) {
10 // System.setProperty("java.util.Arrays.useLegacyMergeSort", "true");
11 nodes.sort(new Comparator<Element>() {
12 @Override
13 public int compare(Element o1, Element o2) {
14 double o1_rate = reckonRate(o1);
15 double o2_rate = reckonRate(o2);
16 return (o2_rate > o1_rate) ? 1 : ((o2_rate == o1_rate) ? 0 : -1);
17 }
18
19 private double reckonRate(Element o) {
20 if (StringUtils.isNotBlank(base_url) && KeysEnum.a.equalsIgnoreCase(o.tagName()) && base_url.equalsIgnoreCase(o.attr(KeysEnum.attr_href)))
21 o.attr(KeysEnum.attr_list_tag_name, o.text());
22 if (null == o || o.children().size() < 2
23 || KeysEnum.html.equalsIgnoreCase(o.tagName()) || KeysEnum.body.equalsIgnoreCase(o.tagName()) || KeysEnum.link.equalsIgnoreCase(o.tagName())
24 || KeysEnum.head.equalsIgnoreCase(o.tagName()) || KeysEnum.title.equalsIgnoreCase(o.tagName()) || KeysEnum.meta.equalsIgnoreCase(o.tagName())
25 || KeysEnum.script.equalsIgnoreCase(o.tagName()) || KeysEnum.style.equalsIgnoreCase(o.tagName())) {
26 o.attr(KeysEnum.attr_scroce, "0");
27 return 0;
28 }
29 String style = o.attr(KeysEnum.style);
30 if (StringUtils.isNotBlank(style) && style.contains(KeysEnum.display_none)) {
31 o.attr(KeysEnum.attr_scroce, "0");
32 return 0;
33 }
34 Map<String, Object> maxKeyDom = getMaxKeyDom(o);
35 String key = (String) maxKeyDom.get(KeysEnum.max_key);
36 int num = (int) maxKeyDom.get(KeysEnum.max_num);
37 if (num < 2) {
38 o.attr(KeysEnum.attr_scroce, "0");
39 return 0;
40 }
41 int scroce = num * key.length();
42 Elements tags = o.children();
43 for (Element a : tags) {
44 if (KeysEnum.div.equalsIgnoreCase(a.tagName())) scroce += 5;
45 if (KeysEnum.ul.equalsIgnoreCase(a.tagName())) scroce += 10;
46 if (KeysEnum.li.equalsIgnoreCase(a.tagName())) scroce += 10;
47 if (KeysEnum.tbody.equalsIgnoreCase(a.tagName())) scroce += 5;
48 if (KeysEnum.table.equalsIgnoreCase(a.tagName())) scroce += 5;
49 if (KeysEnum.tr.equalsIgnoreCase(a.tagName())) scroce += 10;
50 if (KeysEnum.td.equalsIgnoreCase(a.tagName())) scroce += 1;
51 if (KeysEnum.a.equalsIgnoreCase(a.tagName())) scroce += 1;
52 if (KeysEnum.p.equalsIgnoreCase(a.tagName())) scroce += 1;
53 try {
54 Date time = DateParser.parse(a.text());
55 if (null != time) scroce += 20;
56 } catch (Exception e) {
57 }
58 }
59 if (o.text().contains(KeysEnum.next_page)) scroce += 100;
60 if (o.text().contains(KeysEnum.start_page) || o.text().contains(KeysEnum.fisrt_page)) scroce += 100;
61 if (o.text().contains(KeysEnum.end_page) || o.text().contains(KeysEnum.last_page) || o.text().contains(KeysEnum.final_page))
62 scroce += 100;
63 o.attr(KeysEnum.attr_scroce, String.valueOf(scroce));
64 return scroce;
65 }
66 });
67 return nodes;
68 }
7.处理页面html,调用列表分析返回json结果
1 /**2 * 提取页面列表元素的选择器以及页面分类标签
3 *
4 * @param document
5 * @param is_subitem
6 * @return
7 */
8 public static Map<String, Object> dealListNode(Document document, boolean is_subitem) throws Exception {
9 Map<String, Object> result = new HashMap<String, Object>();
10 try {
11 ListAutoFire listAutoFire = new ListAutoFire();
12 Elements list_node = listAutoFire.autoFireListNodes(document);
13 List<Map<String, Object>> lists = new ArrayList();
14 if (null != list_node && list_node.size() > 0) {
15 for (Element list_sel_item : list_node) {
16 if (list_sel_item.hasAttr(KeysEnum.attr_list_tag_name) && StringUtils.isNotBlank(list_sel_item.attr(KeysEnum.attr_list_tag_name))) {
17 result.put(KeysEnum.tag_name, list_sel_item.attr(KeysEnum.attr_list_tag_name));
18 continue;
19 }
20 Map<String, Object> list_dom_frame = new HashMap<>();
21 list_dom_frame.put(KeysEnum.list_sel, list_sel_item.cssSelector());
22 if (is_subitem) {
23 Map<String, List<String>> listItem = new HashMap<String, List<String>>();
24 for (Element item : list_sel_item.children())
25 listItem.put(item.cssSelector(), getYeziNodeSel(item));
26 list_dom_frame.put(KeysEnum.list_dom, listItem);
27 }
28 list_dom_frame.put(KeysEnum.attr_scroce, list_sel_item.attr(KeysEnum.attr_scroce));
29 lists.add(list_dom_frame);
30 }
31 }
32 result.put(KeysEnum.list, lists);
33 } catch (Exception e) {
34 throw new Exception(KeysEnum.error_info, e.getCause());
35 }
36 return result;
37 }
38
39 /**
40 * 处理网页结构
41 *
42 * @param home_url 入口地址
43 * @param list_index 列表元素获取数量
44 * @param is_subitem 是否处理列表元素子项抽取 true/false
45 * @param is_ifr 是否处理iframe true/false
46 * @return
47 */
48 public static Map<String, Object> getWebSiteFrame(String home_url, int list_index, boolean is_subitem, boolean is_ifr) {
49 Map<String, Object> result = new HashMap<String, Object>();
50 if (StringUtils.isBlank(home_url)) return result;
51 try {
52 Document html = Jsoup.connect(home_url).ignoreContentType(true).validateTLSCertificates(false).timeout(5000).get();
53 if (null == html) throw new Exception(KeysEnum.open_fail);
54 Map<String, Object> mapNode = dealListNode(html, is_subitem);
55 List listNode = (List) mapNode.get(KeysEnum.list);
56 result.put(KeysEnum.home_url, home_url);
57 result.put(KeysEnum.tag_name, mapNode.get(KeysEnum.tag_name));
58 result.put(KeysEnum.list, listNode.subList(0, listNode.size() > list_index ? list_index : listNode.size()));
59 result.put(KeysEnum.ifrs, new ArrayList());
60 if (is_ifr) {
61 List<Map<String, Object>> ifrs = (List<Map<String, Object>>) result.get(KeysEnum.ifrs);
62 Elements iframe_nodes = html.getElementsByTag(KeysEnum.iframe);
63 if (null != iframe_nodes) {
64 for (Element iframe : iframe_nodes) {
65 String iframe_url = iframe.attr(KeysEnum.attr_src);
66 if (StringUtils.isBlank(iframe_url)) continue;
67 try {
68 Document iframe_html = Jsoup.connect(iframe_url).ignoreContentType(true).validateTLSCertificates(false).timeout(5000).get();
69 if (null == iframe_html) continue;
70 Map<String, Object> ifrMapNode = dealListNode(iframe_html, is_subitem);
71 List ifrListNode = (List) ifrMapNode.get(KeysEnum.list);
72 Map<String, Object> ifr_map = new HashMap();
73 ifr_map.put(KeysEnum.home_url, iframe_url);
74 ifr_map.put(KeysEnum.tag_name, ifrMapNode.get(KeysEnum.tag_name));
75 ifr_map.put(KeysEnum.list, ifrListNode.subList(0, ifrListNode.size() > list_index ? list_index : ifrListNode.size()));
76 ifrs.add(ifr_map);
77 } catch (Exception e) {
78 e.printStackTrace();
79 }
80 }
81 }
82 }
83 } catch (Exception e) {
84 e.printStackTrace();
85 result.clear();
86 result.put(KeysEnum.home_url, home_url);
87 result.put(KeysEnum.error, KeysEnum.error_info);
88 result.put(KeysEnum.message, e.toString());
89 }
90 return result;
91 }
92
93 /**
94 * 处理网页结构
95 *
96 * @param home_url 入口地址
97 * @param list_index 列表元素获取数量
98 * @param is_subitem 是否处理列表元素子项抽取 true/false
99 * @return
100 */
101 public static Map<String, Object> getWebSiteFrame(String home_url, int list_index, boolean is_subitem) {
102 return getWebSiteFrame(home_url, list_index, is_subitem, false);
103 }
104
105 /**
106 * 处理网页结构
107 *
108 * @param home_url 入口地址
109 * @param list_index 列表元素获取数量
110 * @return
111 */
112 public static Map<String, Object> getWebSiteFrame(String home_url, int list_index) {
113 return getWebSiteFrame(home_url, list_index, false);
114 }
115
116 /**
117 * 处理网页结构
118 *
119 * @param home_url 入口地址
120 * @return
121 */
122 public static Map<String, Object> getWebSiteFrame(String home_url) {
123 return getWebSiteFrame(home_url, 10);
124 }
View Code
8.生成页面分析结果标记文件
1 public static void createMarkFile(Map siteFrame, String home_url, String path) {2 try {
3 Document doc = Jsoup.connect(home_url).ignoreContentType(true).validateTLSCertificates(false).timeout(5000).get();
4 if (null == doc) return;
5 String style = ".mark_color {" +
6 "position:relative;" +
7 "pointer-events:none;" +
8 "left:0px;top:0px;" +
9 "display:inline-block;" +
10 "margin:-2px;width:100%;" +
11 "height:100%;" +
12 "border:dashed 2px #FF69B4;" +
13 "background-color: #43CD80;" +
14 "opacity:0.75;" +
15 "} " ;
16 List list = (List) siteFrame.get("list");
17 for (Object item : list) {
18 Map item_map = (Map) item;
19 String sel = (String) item_map.get("list_sel");
20 doc.select(sel).addClass("mark_color");
21 }
22 String content = doc.html();
23 content = content.contains("<base") ? content : content.replaceFirst("<head", "<base href='" + home_url + "'/><style>" + style + "</style><head");
24 FileUtils.writeStringToFile(new File(path), content, "UTF-8", false);
25
26 } catch (IOException e) {
27 e.printStackTrace();
28 }
29 }
View Code
9.上述第7步返回的结果实例:
拿cnblog首页做测试,返回结果:
字段解释:
home_url :分析的页面地址
tag_name :当前页面的类型(栏目名),多数情况下并不准确:这里只是把 home_url 与页面中各 a 标签的 href 做比对,取匹配到的那个链接的文本(text)
list:页面中疑似列表元素
list_sel:页面中疑似列表元素的选择器
list_dom:以疑似列表元素的每个一级孩子节点的选择器为键,值为该孩子节点下所有叶子元素的选择器列表(仅在 is_subitem 为 true 时返回)
ifrs:页面中包含iframe分析的结果,没有则为空
1 {2 "home_url": "https://www.cnblogs.com/",
3 "tag_name": "1",
4 "list": [
5 {
6 "list_sel": "#post_list",
7 "list_dom": {
8 "#post_list > div.post_item:nth-child(7)": [
9 "#digg_count_9500831",
10 "#post_list > div.post_item:nth-child(7) > div.digg > div.clear",
11 "#digg_tip_9500831",
12 "#post_list > div.post_item:nth-child(7) > div.post_item_body > h3 > a.titlelnk",
13 "#post_list > div.post_item:nth-child(7) > div.post_item_body > p.post_item_summary",
14 "#post_list > div.post_item:nth-child(7) > div.post_item_body > div.post_item_foot > a.lightblue",
15 "#post_list > div.post_item:nth-child(7) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
16 "#post_list > div.post_item:nth-child(7) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
17 "#post_list > div.post_item:nth-child(7) > div.clear"
18 ],
19 "#post_list > div.post_item:nth-child(19)": [
20 "#digg_count_9499348",
21 "#post_list > div.post_item:nth-child(19) > div.digg > div.clear",
22 "#digg_tip_9499348",
23 "#post_list > div.post_item:nth-child(19) > div.post_item_body > h3 > a.titlelnk",
24 "#post_list > div.post_item:nth-child(19) > div.post_item_body > p.post_item_summary > a > img.pfs",
25 "#post_list > div.post_item:nth-child(19) > div.post_item_body > div.post_item_foot > a.lightblue",
26 "#post_list > div.post_item:nth-child(19) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
27 "#post_list > div.post_item:nth-child(19) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
28 "#post_list > div.post_item:nth-child(19) > div.clear"
29 ],
30 "#post_list > div.post_item:nth-child(6)": [
31 "#digg_count_9500833",
32 "#post_list > div.post_item:nth-child(6) > div.digg > div.clear",
33 "#digg_tip_9500833",
34 "#post_list > div.post_item:nth-child(6) > div.post_item_body > h3 > a.titlelnk",
35 "#post_list > div.post_item:nth-child(6) > div.post_item_body > p.post_item_summary",
36 "#post_list > div.post_item:nth-child(6) > div.post_item_body > div.post_item_foot > a.lightblue",
37 "#post_list > div.post_item:nth-child(6) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
38 "#post_list > div.post_item:nth-child(6) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
39 "#post_list > div.post_item:nth-child(6) > div.clear"
40 ],
41 "#post_list > div.post_item:nth-child(9)": [
42 "#digg_count_9500757",
43 "#post_list > div.post_item:nth-child(9) > div.digg > div.clear",
44 "#digg_tip_9500757",
45 "#post_list > div.post_item:nth-child(9) > div.post_item_body > h3 > a.titlelnk",
46 "#post_list > div.post_item:nth-child(9) > div.post_item_body > p.post_item_summary > a > img.pfs",
47 "#post_list > div.post_item:nth-child(9) > div.post_item_body > div.post_item_foot > a.lightblue",
48 "#post_list > div.post_item:nth-child(9) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
49 "#post_list > div.post_item:nth-child(9) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
50 "#post_list > div.post_item:nth-child(9) > div.clear"
51 ],
52 "#post_list > div.post_item:nth-child(17)": [
53 "#digg_count_9495616",
54 "#post_list > div.post_item:nth-child(17) > div.digg > div.clear",
55 "#digg_tip_9495616",
56 "#post_list > div.post_item:nth-child(17) > div.post_item_body > h3 > a.titlelnk",
57 "#post_list > div.post_item:nth-child(17) > div.post_item_body > p.post_item_summary > a > img.pfs",
58 "#post_list > div.post_item:nth-child(17) > div.post_item_body > div.post_item_foot > a.lightblue",
59 "#post_list > div.post_item:nth-child(17) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
60 "#post_list > div.post_item:nth-child(17) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
61 "#post_list > div.post_item:nth-child(17) > div.clear"
62 ],
63 "#post_list > div.post_item:nth-child(8)": [
64 "#digg_count_9500822",
65 "#post_list > div.post_item:nth-child(8) > div.digg > div.clear",
66 "#digg_tip_9500822",
67 "#post_list > div.post_item:nth-child(8) > div.post_item_body > h3 > a.titlelnk",
68 "#post_list > div.post_item:nth-child(8) > div.post_item_body > p.post_item_summary > a > img.pfs",
69 "#post_list > div.post_item:nth-child(8) > div.post_item_body > div.post_item_foot > a.lightblue",
70 "#post_list > div.post_item:nth-child(8) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
71 "#post_list > div.post_item:nth-child(8) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
72 "#post_list > div.post_item:nth-child(8) > div.clear"
73 ],
74 "#post_list > div.post_item:nth-child(18)": [
75 "#digg_count_9499454",
76 "#post_list > div.post_item:nth-child(18) > div.digg > div.clear",
77 "#digg_tip_9499454",
78 "#post_list > div.post_item:nth-child(18) > div.post_item_body > h3 > a.titlelnk",
79 "#post_list > div.post_item:nth-child(18) > div.post_item_body > p.post_item_summary > a > img.pfs",
80 "#post_list > div.post_item:nth-child(18) > div.post_item_body > div.post_item_foot > a.lightblue",
81 "#post_list > div.post_item:nth-child(18) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
82 "#post_list > div.post_item:nth-child(18) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
83 "#post_list > div.post_item:nth-child(18) > div.clear"
84 ],
85 "#post_list > div.post_item:nth-child(3)": [
86 "#digg_count_9500944",
87 "#post_list > div.post_item:nth-child(3) > div.digg > div.clear",
88 "#digg_tip_9500944",
89 "#post_list > div.post_item:nth-child(3) > div.post_item_body > h3 > a.titlelnk",
90 "#post_list > div.post_item:nth-child(3) > div.post_item_body > p.post_item_summary > a > img.pfs",
91 "#post_list > div.post_item:nth-child(3) > div.post_item_body > div.post_item_foot > a.lightblue",
92 "#post_list > div.post_item:nth-child(3) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
93 "#post_list > div.post_item:nth-child(3) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
94 "#post_list > div.post_item:nth-child(3) > div.clear"
95 ],
96 "#post_list > div.post_item:nth-child(2)": [
97 "#digg_count_9500357",
98 "#post_list > div.post_item:nth-child(2) > div.digg > div.clear",
99 "#digg_tip_9500357",
100 "#post_list > div.post_item:nth-child(2) > div.post_item_body > h3 > a.titlelnk",
101 "#post_list > div.post_item:nth-child(2) > div.post_item_body > p.post_item_summary",
102 "#post_list > div.post_item:nth-child(2) > div.post_item_body > div.post_item_foot > a.lightblue",
103 "#post_list > div.post_item:nth-child(2) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
104 "#post_list > div.post_item:nth-child(2) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
105 "#post_list > div.post_item:nth-child(2) > div.clear"
106 ],
107 "#post_list > div.post_item:nth-child(5)": [
108 "#digg_count_9500890",
109 "#post_list > div.post_item:nth-child(5) > div.digg > div.clear",
110 "#digg_tip_9500890",
111 "#post_list > div.post_item:nth-child(5) > div.post_item_body > h3 > a.titlelnk",
112 "#post_list > div.post_item:nth-child(5) > div.post_item_body > p.post_item_summary > a > img.pfs",
113 "#post_list > div.post_item:nth-child(5) > div.post_item_body > div.post_item_foot > a.lightblue",
114 "#post_list > div.post_item:nth-child(5) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
115 "#post_list > div.post_item:nth-child(5) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
116 "#post_list > div.post_item:nth-child(5) > div.clear"
117 ],
118 "#post_list > div.post_item:nth-child(4)": [
119 "#digg_count_9500935",
120 "#post_list > div.post_item:nth-child(4) > div.digg > div.clear",
121 "#digg_tip_9500935",
122 "#post_list > div.post_item:nth-child(4) > div.post_item_body > h3 > a.titlelnk",
123 "#post_list > div.post_item:nth-child(4) > div.post_item_body > p.post_item_summary > a > img.pfs",
124 "#post_list > div.post_item:nth-child(4) > div.post_item_body > div.post_item_foot > a.lightblue",
125 "#post_list > div.post_item:nth-child(4) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
126 "#post_list > div.post_item:nth-child(4) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
127 "#post_list > div.post_item:nth-child(4) > div.clear"
128 ],
129 "#post_list > div.post_item:nth-child(1)": [
130 "#digg_count_9501071",
131 "#post_list > div.post_item:nth-child(1) > div.digg > div.clear",
132 "#digg_tip_9501071",
133 "#post_list > div.post_item:nth-child(1) > div.post_item_body > h3 > a.titlelnk",
134 "#post_list > div.post_item:nth-child(1) > div.post_item_body > p.post_item_summary > a > img.pfs",
135 "#post_list > div.post_item:nth-child(1) > div.post_item_body > div.post_item_foot > a.lightblue",
136 "#post_list > div.post_item:nth-child(1) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
137 "#post_list > div.post_item:nth-child(1) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
138 "#post_list > div.post_item:nth-child(1) > div.clear"
139 ],
140 "#post_list > div.post_item:nth-child(15)": [
141 "#digg_count_9403762",
142 "#post_list > div.post_item:nth-child(15) > div.digg > div.clear",
143 "#digg_tip_9403762",
144 "#post_list > div.post_item:nth-child(15) > div.post_item_body > h3 > a.titlelnk",
145 "#post_list > div.post_item:nth-child(15) > div.post_item_body > p.post_item_summary > a > img.pfs",
146 "#post_list > div.post_item:nth-child(15) > div.post_item_body > div.post_item_foot > a.lightblue",
147 "#post_list > div.post_item:nth-child(15) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
148 "#post_list > div.post_item:nth-child(15) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
149 "#post_list > div.post_item:nth-child(15) > div.clear"
150 ],
151 "#post_list > div.post_item:nth-child(16)": [
152 "#digg_count_9499534",
153 "#post_list > div.post_item:nth-child(16) > div.digg > div.clear",
154 "#digg_tip_9499534",
155 "#post_list > div.post_item:nth-child(16) > div.post_item_body > h3 > a.titlelnk",
156 "#post_list > div.post_item:nth-child(16) > div.post_item_body > p.post_item_summary > a > img.pfs",
157 "#post_list > div.post_item:nth-child(16) > div.post_item_body > div.post_item_foot > a.lightblue",
158 "#post_list > div.post_item:nth-child(16) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
159 "#post_list > div.post_item:nth-child(16) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
160 "#post_list > div.post_item:nth-child(16) > div.clear"
161 ],
162 "#post_list > div.post_item:nth-child(13)": [
163 "#digg_count_9465698",
164 "#post_list > div.post_item:nth-child(13) > div.digg > div.clear",
165 "#digg_tip_9465698",
166 "#post_list > div.post_item:nth-child(13) > div.post_item_body > h3 > a.titlelnk",
167 "#post_list > div.post_item:nth-child(13) > div.post_item_body > p.post_item_summary > a > img.pfs",
168 "#post_list > div.post_item:nth-child(13) > div.post_item_body > div.post_item_foot > a.lightblue",
169 "#post_list > div.post_item:nth-child(13) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
170 "#post_list > div.post_item:nth-child(13) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
171 "#post_list > div.post_item:nth-child(13) > div.clear"
172 ],
173 "#post_list > div.post_item:nth-child(14)": [
174 "#digg_count_9498410",
175 "#post_list > div.post_item:nth-child(14) > div.digg > div.clear",
176 "#digg_tip_9498410",
177 "#post_list > div.post_item:nth-child(14) > div.post_item_body > h3 > a.titlelnk",
178 "#post_list > div.post_item:nth-child(14) > div.post_item_body > p.post_item_summary > a > img.pfs",
179 "#post_list > div.post_item:nth-child(14) > div.post_item_body > div.post_item_foot > a.lightblue",
180 "#post_list > div.post_item:nth-child(14) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
181 "#post_list > div.post_item:nth-child(14) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
182 "#post_list > div.post_item:nth-child(14) > div.clear"
183 ],
184 "#post_list > div.post_item:nth-child(11)": [
185 "#digg_count_9500633",
186 "#post_list > div.post_item:nth-child(11) > div.digg > div.clear",
187 "#digg_tip_9500633",
188 "#post_list > div.post_item:nth-child(11) > div.post_item_body > h3 > a.titlelnk",
189 "#post_list > div.post_item:nth-child(11) > div.post_item_body > p.post_item_summary",
190 "#post_list > div.post_item:nth-child(11) > div.post_item_body > div.post_item_foot > a.lightblue",
191 "#post_list > div.post_item:nth-child(11) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
192 "#post_list > div.post_item:nth-child(11) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
193 "#post_list > div.post_item:nth-child(11) > div.clear"
194 ],
195 "#post_list > div.post_item:nth-child(12)": [
196 "#digg_count_9500352",
197 "#post_list > div.post_item:nth-child(12) > div.digg > div.clear",
198 "#digg_tip_9500352",
199 "#post_list > div.post_item:nth-child(12) > div.post_item_body > h3 > a.titlelnk",
200 "#post_list > div.post_item:nth-child(12) > div.post_item_body > p.post_item_summary > a > img.pfs",
201 "#post_list > div.post_item:nth-child(12) > div.post_item_body > div.post_item_foot > a.lightblue",
202 "#post_list > div.post_item:nth-child(12) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
203 "#post_list > div.post_item:nth-child(12) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
204 "#post_list > div.post_item:nth-child(12) > div.clear"
205 ],
206 "#post_list > div.post_item:nth-child(20)": [
207 "#digg_count_9499225",
208 "#post_list > div.post_item:nth-child(20) > div.digg > div.clear",
209 "#digg_tip_9499225",
210 "#post_list > div.post_item:nth-child(20) > div.post_item_body > h3 > a.titlelnk",
211 "#post_list > div.post_item:nth-child(20) > div.post_item_body > p.post_item_summary > a > img.pfs",
212 "#post_list > div.post_item:nth-child(20) > div.post_item_body > div.post_item_foot > a.lightblue",
213 "#post_list > div.post_item:nth-child(20) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
214 "#post_list > div.post_item:nth-child(20) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
215 "#post_list > div.post_item:nth-child(20) > div.clear"
216 ],
217 "#post_list > div.post_item:nth-child(10)": [
218 "#digg_count_9500632",
219 "#post_list > div.post_item:nth-child(10) > div.digg > div.clear",
220 "#digg_tip_9500632",
221 "#post_list > div.post_item:nth-child(10) > div.post_item_body > h3 > a.titlelnk",
222 "#post_list > div.post_item:nth-child(10) > div.post_item_body > p.post_item_summary > a > img.pfs",
223 "#post_list > div.post_item:nth-child(10) > div.post_item_body > div.post_item_foot > a.lightblue",
224 "#post_list > div.post_item:nth-child(10) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
225 "#post_list > div.post_item:nth-child(10) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
226 "#post_list > div.post_item:nth-child(10) > div.clear"
227 ]
228 },
229 "scroce": "9860"
230 },
231 {
232 "list_sel": "#cate_item",
233 "list_dom": {
234 "#cate_item_108705": [
235 "#cate_item_108705 > a"
236 ],
237 "#cate_item_108704": [
238 "#cate_item_108704 > a"
239 ],
240 "#cate_item_108703": [
241 "#cate_item_108703 > a"
242 ],
243 "#cate_item_4": [
244 "#cate_item_4 > a"
245 ],
246 "#cate_item_2": [
247 "#cate_item_2 > a"
248 ],
249 "#cate_item_108709": [
250 "#cate_item_108709 > a"
251 ],
252 "#cate_item_0": [
253 "#cate_item_0 > a"
254 ],
255 "#cate_item_108698": [
256 "#cate_item_108698 > a"
257 ],
258 "#cate_item_108724": [
259 "#cate_item_108724 > a"
260 ],
261 "#cate_item_108701": [
262 "#cate_item_108701 > a"
263 ],
264 "#cate_item_108712": [
265 "#cate_item_108712 > a"
266 ],
267 "#cate_item_-1": [
268 "#cate_item_-1 > a"
269 ]
270 },
271 "scroce": "1248"
272 },
273 {
274 "list_sel": "#friend_link",
275 "list_dom": {
276 "#friend_link > a:nth-child(15)": [
277 "#friend_link > a:nth-child(15)"
278 ],
279 "#friend_link > a:nth-child(16)": [
280 "#friend_link > a:nth-child(16)"
281 ],
282 "#friend_link > a:nth-child(17)": [
283 "#friend_link > a:nth-child(17)"
284 ],
285 "#friend_link > a:nth-child(18)": [
286 "#friend_link > a:nth-child(18)"
287 ],
288 "#friend_link > a:nth-child(1)": [
289 "#friend_link > a:nth-child(1)"
290 ],
291 "#friend_link > a:nth-child(11)": [
292 "#friend_link > a:nth-child(11)"
293 ],
294 "#friend_link > a:nth-child(12)": [
295 "#friend_link > a:nth-child(12)"
296 ],
297 "#friend_link > a:nth-child(3)": [
298 "#friend_link > a:nth-child(3)"
299 ],
300 "#friend_link > a:nth-child(13)": [
301 "#friend_link > a:nth-child(13)"
302 ],
303 "#friend_link > a:nth-child(2)": [
304 "#friend_link > a:nth-child(2)"
305 ],
306 "#friend_link > a:nth-child(14)": [
307 "#friend_link > a:nth-child(14)"
308 ],
309 "#friend_link > a:nth-child(19)": [
310 "#friend_link > a:nth-child(19)"
311 ],
312 "#friend_link > a:nth-child(5)": [
313 "#friend_link > a:nth-child(5)"
314 ],
315 "#friend_link > a:nth-child(4)": [
316 "#friend_link > a:nth-child(4)"
317 ],
318 "#friend_link > a:nth-child(7)": [
319 "#friend_link > a:nth-child(7)"
320 ],
321 "#friend_link > a:nth-child(6)": [
322 "#friend_link > a:nth-child(6)"
323 ],
324 "#friend_link > a:nth-child(10)": [
325 "#friend_link > a:nth-child(10)"
326 ],
327 "#friend_link > a:nth-child(9)": [
328 "#friend_link > a:nth-child(9)"
329 ],
330 "#friend_link > a:nth-child(8)": [
331 "#friend_link > a:nth-child(8)"
332 ]
333 },
334 "scroce": "1197"
335 },
336 {
337 "list_sel": "#side_nav",
338 "list_dom": {
339 "#side_nav > div.w_l:nth-child(16)": [
340 "#side_nav > div.w_l:nth-child(16) > h4",
341 "#site_stats"
342 ],
343 "#side_nav > p.r_l_1:nth-child(7)": [
344 "#side_nav > p.r_l_1:nth-child(7)"
345 ],
346 "#side_nav > p.r_l_2:nth-child(8)": [
347 "#side_nav > p.r_l_2:nth-child(8)"
348 ],
349 "#side_nav > p.r_l_3:nth-child(9)": [
350 "#side_nav > p.r_l_3:nth-child(9)"
351 ],
352 "#side_nav > p.r_l_1:nth-child(5)": [
353 "#side_nav > p.r_l_1:nth-child(5)"
354 ],
355 "#side_nav > p.r_l_3:nth-child(13)": [
356 "#side_nav > p.r_l_3:nth-child(13)"
357 ],
358 "#side_nav > p.r_l_2:nth-child(4)": [
359 "#side_nav > p.r_l_2:nth-child(4)"
360 ],
361 "#side_nav > p.r_l_3:nth-child(19)": [
362 "#side_nav > p.r_l_3:nth-child(19)"
363 ],
364 "#side_nav > p.r_l_3:nth-child(3)": [
365 "#side_nav > p.r_l_3:nth-child(3)"
366 ],
367 "#side_nav > div.w_l:nth-child(6)": [
368 "#side_nav > div.w_l:nth-child(6) > h4",
369 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(1) > a",
370 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(2) > a",
371 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(3) > a",
372 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(4) > a",
373 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(5) > a",
374 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(6) > a"
375 ],
376 "#side_nav > p.r_l_2:nth-child(18)": [
377 "#side_nav > p.r_l_2:nth-child(18)"
378 ],
379 "#side_nav > div.l_s:nth-child(12)": [
380 "#side_nav > div.l_s:nth-child(12)"
381 ],
382 "#cate_title_block": [
383 "#cate_title_title > div.cate_title",
384 "#cate_item_108698 > a",
385 "#cate_item_2 > a",
386 "#cate_item_108701 > a",
387 "#cate_item_108703 > a",
388 "#cate_item_108704 > a",
389 "#cate_item_108705 > a",
390 "#cate_item_108709 > a",
391 "#cate_item_108712 > a",
392 "#cate_item_108724 > a",
393 "#cate_item_4 > a",
394 "#cate_item_0 > a",
395 "#cate_item_-1 > a",
396 "#cate_title_block > div.cate_bottom",
397 "#cate_sub_block",
398 "#cate_title_block > script"
399 ],
400 "#side_nav > div.l_s:nth-child(2)": [
401 "#side_nav > div.l_s:nth-child(2)"
402 ],
403 "#side_nav > p.r_l_1:nth-child(17)": [
404 "#side_nav > p.r_l_1:nth-child(17)"
405 ],
406 "#side_nav > p.r_l_2:nth-child(14)": [
407 "#side_nav > p.r_l_2:nth-child(14)"
408 ],
409 "#side_nav > p.r_l_1:nth-child(15)": [
410 "#side_nav > p.r_l_1:nth-child(15)"
411 ],
412 "#user_stats": [
413 "#user_stats"
414 ],
415 "#side_nav > div.l_s:nth-child(10)": [
416 "#side_nav > div.l_s:nth-child(10)"
417 ]
418 },
419 "scroce": "975"
420 },
421 {
422 "list_sel": "#paging_block > div.pager",
423 "list_dom": {
424 "#paging_block > div.pager > a.p_9.middle": [
425 "#paging_block > div.pager > a.p_9.middle"
426 ],
427 "#paging_block > div.pager > a.p_7.middle": [
428 "#paging_block > div.pager > a.p_7.middle"
429 ],
430 "#paging_block > div.pager > a.p_8.middle": [
431 "#paging_block > div.pager > a.p_8.middle"
432 ],
433 "#paging_block > div.pager > a:nth-child(14)": [
434 "#paging_block > div.pager > a:nth-child(14)"
435 ],
436 "#paging_block > div.pager > a.p_11.middle": [
437 "#paging_block > div.pager > a.p_11.middle"
438 ],
439 "#paging_block > div.pager > a.p_3.middle": [
440 "#paging_block > div.pager > a.p_3.middle"
441 ],
442 "#paging_block > div.pager > a.p_4.middle": [
443 "#paging_block > div.pager > a.p_4.middle"
444 ],
445 "#paging_block > div.pager > a.p_10.middle": [
446 "#paging_block > div.pager > a.p_10.middle"
447 ],
448 "#paging_block > div.pager > a.p_2.middle": [
449 "#paging_block > div.pager > a.p_2.middle"
450 ],
451 "#paging_block > div.pager > a.p_5.middle": [
452 "#paging_block > div.pager > a.p_5.middle"
453 ],
454 "#paging_block > div.pager > a.p_6.middle": [
455 "#paging_block > div.pager > a.p_6.middle"
456 ],
457 "#paging_block > div.pager > a.p_1.current": [
458 "#paging_block > div.pager > a.p_1.current"
459 ],
460 "#paging_block > div.pager > span.ellipsis": [
461 "#paging_block > div.pager > span.ellipsis"
462 ],
463 "#paging_block > div.pager > a.p_200.last": [
464 "#paging_block > div.pager > a.p_200.last"
465 ]
466 },
467 "scroce": "865"
468 },
469 {
470 "list_sel": "#main > div.post_nav_block_wrapper > ul.post_nav_block",
471 "list_dom": {
472 "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(1)": [
473 "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(1) > a.current_nav"
474 ],
475 "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(3)": [
476 "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(3) > a"
477 ],
478 "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(2)": [
479 "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(2) > a"
480 ],
481 "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(5)": [
482 "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(5) > a"
483 ],
484 "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(4)": [
485 "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(4) > a"
486 ],
487 "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(7)": [
488 "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(7) > a"
489 ],
490 "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(6)": [
491 "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(6) > a"
492 ]
493 },
494 "scroce": "590"
495 },
496 {
497 "list_sel": "#nav_menu",
498 "list_dom": {
499 "#nav_menu > a:nth-child(3)": [
500 "#nav_menu > a:nth-child(3)"
501 ],
502 "#nav_menu > a:nth-child(2)": [
503 "#nav_menu > a:nth-child(2)"
504 ],
505 "#nav_menu > a:nth-child(5)": [
506 "#nav_menu > a:nth-child(5)"
507 ],
508 "#nav_menu > a:nth-child(4)": [
509 "#nav_menu > a:nth-child(4)"
510 ],
511 "#nav_menu > a:nth-child(1)": [
512 "#nav_menu > a:nth-child(1)"
513 ],
514 "#nav_menu > a:nth-child(7)": [
515 "#nav_menu > a:nth-child(7)"
516 ],
517 "#nav_menu > a:nth-child(6)": [
518 "#nav_menu > a:nth-child(6)"
519 ],
520 "#nav_menu > a:nth-child(9)": [
521 "#nav_menu > a:nth-child(9)"
522 ],
523 "#nav_menu > a:nth-child(8)": [
524 "#nav_menu > a:nth-child(8)"
525 ]
526 },
527 "scroce": "486"
528 },
529 {
530 "list_sel": "#side_nav > div.w_l:nth-child(6) > ul",
531 "list_dom": {
532 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(3)": [
533 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(3) > a"
534 ],
535 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(2)": [
536 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(2) > a"
537 ],
538 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(1)": [
539 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(1) > a"
540 ],
541 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(6)": [
542 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(6) > a"
543 ],
544 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(5)": [
545 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(5) > a"
546 ],
547 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(4)": [
548 "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(4) > a"
549 ]
550 },
551 "scroce": "486"
552 },
553 {
554 "list_sel": "#headline_block > ul",
555 "list_dom": {
556 "#headline_block > ul > li:nth-child(4)": [
557 "#headline_block > ul > li:nth-child(4) > a:nth-child(1)",
558 "#headline_block > ul > li:nth-child(4) > a.right_more"
559 ],
560 "#headline_block > ul > li.editor_pick": [
561 "#editor_pick_count",
562 "#headline_block > ul > li.editor_pick > a.right_more"
563 ],
564 "#headline_block > ul > li:nth-child(3)": [
565 "#headline_block > ul > li:nth-child(3) > a:nth-child(1)",
566 "#headline_block > ul > li:nth-child(3) > a.right_more"
567 ],
568 "#headline_block > ul > li:nth-child(2)": [
569 "#headline_block > ul > li:nth-child(2) > a:nth-child(1)",
570 "#headline_block > ul > li:nth-child(2) > a.right_more"
571 ]
572 },
573 "scroce": "407"
574 },
575 {
576 "list_sel": "#header",
577 "list_dom": {
578 "#header > p.h_r_3:nth-child(1)": [
579 "#header > p.h_r_3:nth-child(1)"
580 ],
581 "#header > p.h_r_2:nth-child(6)": [
582 "#header > p.h_r_2:nth-child(6)"
583 ],
584 "#header > p.h_r_1:nth-child(3)": [
585 "#header > p.h_r_1:nth-child(3)"
586 ],
587 "#header > p.h_r_2:nth-child(2)": [
588 "#header > p.h_r_2:nth-child(2)"
589 ],
590 "#header > p.h_r_1:nth-child(5)": [
591 "#header > p.h_r_1:nth-child(5)"
592 ],
593 "#header > p.h_r_3:nth-child(7)": [
594 "#header > p.h_r_3:nth-child(7)"
595 ],
596 "#header_block": [
597 "#logo > h1 > a > img",
598 "#header_block > div.clear"
599 ]
600 },
601 "scroce": "335"
602 }
603 ],
604 "ifrs": []
605 }
View Code
10.上述第8步标记文件效果:
红色虚线框起来的是返回的json结果中list中的list_sel选择器选中的元素
分析结果统计:
处理了将近1万个网站后发现,大致的网页列表结构基本都可以被识别出来,平均耗时大致在2-3s左右,因为用的是jsoup访问网页,其中包含了网页响应的时间,时间复杂度有待优化,
对于一些结构比较复杂、杂乱的网页,分析结果的准确性有待加强,代码写得比较乱,有待优化,应该会有更好的处理方式,还请指教,相互学习交流。
转载请注明出处:https://www.cnblogs.com/jstarseven/p/9501210.html
源码地址:https://github.com/jstarseven/list-autofire
-END-
以上是 java实现网页结构分析列表发现 的全部内容, 来源链接: utcz.com/z/391781.html