一个java版本的简单邮箱小爬虫

java

//趁着有空回头复习了一把正则表达式
/*
  以下代码以百度某个贴吧的 URL 作为源,实现了读取 EmailAddress 并写入文件保存起来的两个功能,如果要爬取其它信息,可以改写正则实现相应功能
  要点看引入的包可知:
     1.应用到 IO 读写缓冲字符流
     2.应用到正则表达式
     3.URL 对象获取网页信息
     4.util 包的集合框架 ArrayList
*/
import java.io.*;

import java.util.regex.*;

import java.net.*;

import java.util.*;

/**
 * Minimal e-mail scraper: reads the text behind a URL line by line, collects
 * every substring that looks like an e-mail address and stores the result in
 * a text file, one address per line.
 */
class Spider {

    /** One or more word characters, '@', word characters, then one or more ".word" groups. */
    private static final Pattern EMAIL_PATTERN = Pattern.compile("\\w+@\\w+(\\.\\w+)+");

    /**
     * Scrapes a fixed Baidu Tieba thread, prints each address that was found
     * and then saves the whole list to a hard-coded file on the desktop.
     */
    public static void main(String[] args) throws Exception {
        URL url = new URL("http://tieba.baidu.com/p/2314539885");
        ArrayList<String> emailList = getEmailByURL(url);
        for (String emailAddress : emailList) {
            System.out.println(emailAddress);
        }
        String qualifiedName = "c://users//ghc//desktop//test//emailAddress.txt";
        writeEmailToFile(qualifiedName, emailList);
    }

    /**
     * Reads the content behind {@code url} and extracts all e-mail addresses.
     *
     * @param url location of the text to scan
     * @return the matches in the order they appear; duplicates are kept
     * @throws Exception if the connection cannot be opened or read
     */
    public static ArrayList<String> getEmailByURL(URL url) throws Exception {
        ArrayList<String> emailList = new ArrayList<String>();
        BufferedReader bufreader =
                new BufferedReader(new InputStreamReader(url.openConnection().getInputStream()));
        try {
            String line;
            while ((line = bufreader.readLine()) != null) {
                Matcher m = EMAIL_PATTERN.matcher(line);
                while (m.find()) {
                    emailList.add(m.group());
                }
            }
        } finally {
            // Always release the underlying connection stream, even when reading fails.
            bufreader.close();
        }
        return emailList;
    }

    /**
     * Writes every entry of {@code emailList} to the file {@code qualifiedName},
     * one entry per line, replacing any previous content of that file.
     *
     * @param qualifiedName path of the file to (over)write
     * @param emailList     entries to store
     * @throws Exception if the file cannot be created or written
     */
    public static void writeEmailToFile(String qualifiedName, ArrayList<String> emailList) throws Exception {
        BufferedWriter bufwriter = new BufferedWriter(new FileWriter(qualifiedName));
        try {
            for (String email : emailList) {
                bufwriter.write(email);
                bufwriter.newLine();
            }
        } finally {
            // close() flushes the buffer and releases the file handle.
            bufwriter.close();
        }
    }
}


/* Two ways to read keyboard input: Scanner and BufferedReader */

import java.io.*;

import java.util.*;

import java.util.Scanner;

/**
 * Demonstrates two ways of reading lines from standard input.
 */
class MyTest {

    /**
     * Echoes one line read through a Scanner, then echoes every line that a
     * BufferedReader can still obtain from standard input until end of input.
     */
    public static void main(String[] args) throws Exception {
        // Approach 1: Scanner - read a single line and print it back.
        Scanner keyboard = new Scanner(System.in);
        System.out.println(keyboard.nextLine());

        // Approach 2: BufferedReader - print each line until readLine() reports end of input.
        // NOTE(review): both readers wrap the same System.in; the Scanner may already have
        // buffered input beyond its first line - confirm this is acceptable for the demo.
        BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
        for (String text = reader.readLine(); text != null; text = reader.readLine()) {
            System.out.println(text);
        }
    }
}

/* 把叠词 简化 */

/**
 * Collapses a stuttered sentence: first removes runs of dots, then reduces
 * every run of a repeated character to a single occurrence.
 */
class AbrreviateDemo {

    /** Runs both replacement passes on a sample sentence and prints the result. */
    public static void main(String[] args) {
        // Pass 1: drop every run of one or more '.' characters.
        String withoutDots = retriveStr("II...LLL...ove..ee.....you!", "\\.+", "");
        // Pass 2: "(.)\1+" matches a character followed by copies of itself; keep only group 1.
        String collapsed = retriveStr(withoutDots, "(.)\\1+", "$1");
        System.out.println(collapsed);
    }

    /**
     * Replaces every match of {@code regex} in {@code str}.
     *
     * @param str        text to process
     * @param regex      regular expression to look for
     * @param replaceStr replacement; may reference groups such as {@code $1}
     * @return the text after all replacements
     */
    public static String retriveStr(String str, String regex, String replaceStr) {
        String replaced = str.replaceAll(regex, replaceStr);
        return replaced;
    }
}

/* 将一堆杂乱的 IP 地址进行排序 */

import java.util.*;

/**
 * Sorts a space-separated list of IPv4 addresses by rewriting each number to
 * a fixed width so that plain string ordering can be used.
 */
class SortIP {

    /** Sorts and prints a small sample list of addresses. */
    public static void main(String[] args) {
        printAfterSort("192.168.0.5 2.2.3.4 127.0.0.1");
    }

    /**
     * Prints the zero-padded form of {@code str}, then each address on its own
     * line in ascending string order with the padding removed again.
     *
     * @param str addresses separated by one or more spaces
     */
    public static void printAfterSort(String str) {
        // Prefix every number with "00" so that each one has at least three digits ...
        String prefixed = str.replaceAll("(0*\\d+)", "00$1");
        // ... then strip the surplus leading zeros, keeping three digits per number.
        String fixedWidth = prefixed.replaceAll("0*(\\d{3})", "$1");
        System.out.println(fixedWidth);

        String[] addresses = fixedWidth.split(" +");
        Arrays.sort(addresses);
        for (String address : addresses) {
            // Remove the padding zeros before printing (a lone "000" becomes "0").
            System.out.println(address.replaceAll("0*(\\d+)", "$1"));
        }
    }
}

/* 邮箱地址校验 */

/**
 * Validates an e-mail address with a regular expression.
 */
class checkMailDemo {

    /** Checks a sample address and prints the outcome. */
    public static void main(String[] args) {
        String candidate = "liyu@gchchina.com.cn";
        boolean valid = checkMail(candidate);
        System.out.println("result: " + valid);
    }

    /**
     * Tests whether the whole of {@code str} has the form
     * word characters, '@', word characters, then one to three ".word" groups.
     *
     * @param str text to validate
     * @return {@code true} if the complete string matches the pattern
     */
    public static boolean checkMail(String str) {
        final String mailPattern = "\\w+@\\w+(\\.\\w+){1,3}";
        return str.matches(mailPattern);
    }
}

/* 从一堆杂乱的字符串中获取需要的手机号码 */

import java.util.regex.*;

/**
 * Extracts mobile phone numbers that are embedded in arbitrary text.
 */
class RegexDemo {

    /**
     * Eleven digits: a non-zero first digit, a second digit of 3, 5 or 8, then
     * nine more digits. The second class is written as {@code [358]}; inside a
     * character class a comma is a literal, so {@code [3,5,8]} would also
     * accept ',' as the second character.
     */
    static final String MOBILE_REGEX = "[1-9][358]\\d{9}";

    /** Prints the phone numbers found in a sample string. */
    public static void main(String[] args) {
        String str = "1afasdf13874057617weojfjlj";
        retriveStr(str, MOBILE_REGEX);
    }

    /**
     * Prints every non-overlapping match of {@code regex} in {@code str},
     * one match per line, in order of appearance.
     *
     * @param str   text to search
     * @param regex regular expression describing the wanted substrings
     */
    public static void retriveStr(String str, String regex) {
        Matcher m = Pattern.compile(regex).matcher(str);
        while (m.find()) {
            System.out.println(m.group());
        }
    }
}

/* 读取键盘标准输入流并大写方式打印到控制台 */

import java.io.*;

import java.util.*;

/**
 * Echoes standard input to the console in upper case.
 */
class UpercaseSystemIn {

    /** Upper-cases everything typed on the keyboard until "exit" or end of input. */
    public static void main(String[] args) throws IOException {
        InputStream in = System.in;
        doUpcaseReadIn(in);
    }

    /**
     * Reads {@code in} line by line and prints each line in upper case.
     * Processing stops at end of input or after a line equal to "exit"
     * (ignoring case) has been echoed.
     *
     * <p>The method returns normally instead of terminating the JVM, so it
     * can be called from code that has more work to do afterwards. The stream
     * is not closed here because it belongs to the caller.
     *
     * @param in stream to read text from
     * @throws IOException if reading from {@code in} fails
     */
    public static void doUpcaseReadIn(InputStream in) throws IOException {
        BufferedReader bufr = new BufferedReader(new InputStreamReader(in));
        String str;
        while ((str = bufr.readLine()) != null) {
            System.out.println(str.toUpperCase());
            if (str.equalsIgnoreCase("exit")) {
                return;
            }
        }
    }
}

/* 读取某个贴吧邮箱地址并打印到控制台 注意这里的正则*/

import java.net.*;

import java.io.*;

import java.util.*;

import java.util.regex.*;

/**
 * Prints every e-mail address found in the text behind a URL.
 */
class SpiderTest {

    /** One or more word characters, '@', word characters, then one or more ".word" groups. */
    private static final Pattern EMAIL_PATTERN = Pattern.compile("\\w+@\\w+(\\.\\w+)+");

    /** Scans a fixed Baidu Tieba thread. */
    public static void main(String[] args) throws Exception {
        URL url = new URL("http://tieba.baidu.com/p/2314539885");
        getEmailAddressFromURL(url);
    }

    /**
     * Reads the content behind {@code url} line by line and prints each
     * e-mail address on its own line, in order of appearance.
     *
     * @param url location of the text to scan
     * @throws Exception if the connection cannot be opened or read
     */
    public static void getEmailAddressFromURL(URL url) throws Exception {
        BufferedReader bufreader =
                new BufferedReader(new InputStreamReader(url.openConnection().getInputStream()));
        try {
            String line;
            while ((line = bufreader.readLine()) != null) {
                Matcher m = EMAIL_PATTERN.matcher(line);
                while (m.find()) {
                    System.out.println(m.group());
                }
            }
        } finally {
            // Release the connection stream even if reading fails part-way.
            bufreader.close();
        }
    }
}

/* 实现本地二进制文件拷贝 */

import java.io.*;

/**
 * Copies a local binary file byte for byte.
 */
class CopyImg {

    /**
     * Copies "psb.jpg" to "psb_copy.jpg" in the current working directory.
     * I/O failures are reported on the error stream; both streams are closed
     * in every case.
     */
    public static void main(String[] args) {
        BufferedInputStream source = null;
        BufferedOutputStream target = null;
        try {
            source = new BufferedInputStream(new FileInputStream("psb.jpg"));
            target = new BufferedOutputStream(new FileOutputStream("psb_copy.jpg"));
            byte[] chunk = new byte[8192];
            // Write exactly as many bytes as each read delivered.
            for (int count = source.read(chunk); count > 0; count = source.read(chunk)) {
                target.write(chunk, 0, count);
            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
        } finally {
            closeAndReport(source);
            closeAndReport(target);
        }
    }

    /** Closes {@code resource} if it was opened, printing any failure instead of propagating it. */
    private static void closeAndReport(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
    }
}

/* 从某个网页爬取符合规则的邮箱地址并保存到本地磁盘路径下 */

import java.io.*;

import java.util.regex.*;

import java.net.*;

import java.util.*;

/**
 * Collects the e-mail addresses found on a web page and saves them to a
 * local file, one address per line.
 */
class Spider {

    /** One or more word characters, '@', word characters, then one or more ".word" groups. */
    private static final Pattern EMAIL_PATTERN = Pattern.compile("\\w+@\\w+(\\.\\w+)+");

    /**
     * Scrapes a fixed Baidu Tieba thread, prints each address that was found
     * and then saves the whole list to a hard-coded file on the desktop.
     */
    public static void main(String[] args) throws Exception {
        URL url = new URL("http://tieba.baidu.com/p/2314539885");
        ArrayList<String> emailList = getEmailByURL(url);
        for (String emailAddress : emailList) {
            System.out.println(emailAddress);
        }
        String qualifiedName = "c://users//ghc//desktop//test//emailAddress.txt";
        writeEmailToFile(qualifiedName, emailList);
    }

    /**
     * Reads the content behind {@code url} and extracts all e-mail addresses.
     *
     * @param url location of the text to scan
     * @return the matches in the order they appear; duplicates are kept
     * @throws Exception if the connection cannot be opened or read
     */
    public static ArrayList<String> getEmailByURL(URL url) throws Exception {
        ArrayList<String> emailList = new ArrayList<String>();
        BufferedReader bufreader =
                new BufferedReader(new InputStreamReader(url.openConnection().getInputStream()));
        try {
            String line;
            while ((line = bufreader.readLine()) != null) {
                Matcher m = EMAIL_PATTERN.matcher(line);
                while (m.find()) {
                    emailList.add(m.group());
                }
            }
        } finally {
            // Always release the underlying connection stream, even when reading fails.
            bufreader.close();
        }
        return emailList;
    }

    /**
     * Writes every entry of {@code emailList} to the file {@code qualifiedName},
     * one entry per line, replacing any previous content of that file.
     *
     * @param qualifiedName path of the file to (over)write
     * @param emailList     entries to store
     * @throws Exception if the file cannot be created or written
     */
    public static void writeEmailToFile(String qualifiedName, ArrayList<String> emailList) throws Exception {
        BufferedWriter bufwriter = new BufferedWriter(new FileWriter(qualifiedName));
        try {
            for (String email : emailList) {
                bufwriter.write(email);
                bufwriter.newLine();
            }
        } finally {
            // close() flushes the buffer and releases the file handle.
            bufwriter.close();
        }
    }
}

/* 从某个网页爬取图片的 URL 地址然后 进行 下载到本地磁盘路径 基本功能已经实现,但是正则需要自行调整 */

import java.net.*;

import java.io.*;

import java.util.regex.*;

import java.util.*;

/**
 * Downloads the images referenced by a web page into a local folder.
 *
 * <p>The two regular expressions are deliberately simple and only recognise
 * image URLs of the form {@code http://host/images/name.ext}; adjust them for
 * other sites.
 */
class ImgSpider {

    /** Matches an {@code <img ... src=... >} tag within a single line of HTML. */
    private static final Pattern IMG_TAG = Pattern.compile("<img.*src=(.*?)[^>]*?>");

    /** Matches an absolute image URL such as http://www.jb51.net/images/logo.gif. */
    private static final Pattern IMG_URL =
            Pattern.compile("http://(\\w+\\.)+[a-z]+/images/(\\w+\\.)+[a-z]{3}");

    /** Downloads the images of a fixed page into a hard-coded desktop folder. */
    public static void main(String[] args) {
        saveImgFromURL("http://image.baidu.com/", "c:/users/ghc/desktop/test/");
    }

    /**
     * Downloads the resource at URL {@code line} and stores it in file {@code path}.
     *
     * @param line URL of the resource to fetch
     * @param path destination file; any '<' character is removed from it first
     * @return {@code true} if the complete content was written, {@code false}
     *         if the URL was invalid or an I/O error occurred (the error is
     *         printed to the error stream)
     */
    public static boolean downLoadImg(String line, String path) {
        boolean flag = true;
        BufferedInputStream bufinpts = null;
        BufferedOutputStream bufopts = null;
        path = path.replace("<", "");
        try {
            bufinpts = new BufferedInputStream(new URL(line).openConnection().getInputStream());
            bufopts = new BufferedOutputStream(new FileOutputStream(path));
            byte[] buf = new byte[1024];
            int len;
            while ((len = bufinpts.read(buf)) != -1) {
                // Only the first len bytes of buf are valid after a read.
                bufopts.write(buf, 0, len);
            }
            // Flush inside the try block so that a failing final write is reported as a failure.
            bufopts.flush();
        } catch (IOException ioe) {
            ioe.printStackTrace();
            flag = false;
        } finally {
            closeQuietly(bufopts);
            closeQuietly(bufinpts);
        }
        return flag;
    }

    /**
     * Reads the page at {@code urlStr}, prints every {@code <img>} tag it finds
     * and downloads each recognised image URL into {@code folder}. The local
     * file name is the last path segment of the image URL, so images that share
     * a name overwrite each other.
     *
     * @param urlStr address of the page to scan
     * @param folder destination directory prefix, including the trailing separator
     * @return {@code true} if the page was read and every attempted download
     *         succeeded; {@code false} if the page URL is malformed, reading
     *         the page failed, or at least one download failed
     */
    public static boolean saveImgFromURL(String urlStr, String folder) {
        boolean flag = true;
        BufferedReader bufr = null;
        try {
            URL url = new URL(urlStr);
            bufr = new BufferedReader(new InputStreamReader(url.openConnection().getInputStream()));

            ArrayList<String> imgList = new ArrayList<String>();
            String line;
            while ((line = bufr.readLine()) != null) {
                Matcher tagMatcher = IMG_TAG.matcher(line);
                while (tagMatcher.find()) {
                    System.out.println(tagMatcher.group());
                    imgList.add(tagMatcher.group());
                }
            }

            for (String imgTag : imgList) {
                Matcher urlMatcher = IMG_URL.matcher(imgTag);
                while (urlMatcher.find()) {
                    String imgUrl = urlMatcher.group();
                    // Build the target per image; the folder itself must not grow between downloads.
                    String fileName = imgUrl.substring(imgUrl.lastIndexOf('/') + 1);
                    if (!downLoadImg(imgUrl, folder + fileName)) {
                        // Keep going with the remaining images, but report the failure.
                        flag = false;
                    }
                }
            }
        } catch (MalformedURLException mfe) {
            mfe.printStackTrace();
            flag = false;
        } catch (IOException ioe) {
            ioe.printStackTrace();
            flag = false;
        } finally {
            closeQuietly(bufr);
        }
        return flag;
    }

    /** Closes {@code resource} if it was opened; a failure is printed, not propagated. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
    }
}

/*正则 小练习 */

/**
 * Small regular-expression exercises: validating, splitting and replacing.
 */
class Demo {

    /** Runs each exercise once on sample data and prints the results. */
    public static void main(String[] args) {
        String qq = "1212345";
        boolean checkResult = checkQQ(qq);
        System.out.println(checkResult ? qq + " is right" : qq + " is wrong!!!");

        String telnumber = "15974097817";
        checkResult = checkTel(telnumber);
        System.out.println(checkResult ? telnumber + " is right" : telnumber + " is wrong!!!");

        String path = "c:\\users\\frank\\abqqcdkkkefghhijkkkkkl.txt";
        // Split on repeated characters: (.) captures one character and \1+ requires one or
        // more copies of it, so runs such as "qq" or "kkk" act as separators.
        String regex = "(.)\\1+";
        printAfterSplit(path, regex);
        System.out.println("=================");

        regex = "\\.";
        printAfterSplit(path, regex);

        String str = "abcddeffffg";
        // Collapse a character that occurs at least four times in a row into one occurrence.
        regex = "(.)\\1{3,}";
        String replaceStr = "$1";
        printAfterReplaceStr(str, regex, replaceStr);
    }

    /**
     * Validates a mobile number: '1', then one of 3, 5 or 8, then nine digits.
     * The second class is written as {@code [358]} because a comma inside a
     * character class is a literal and would otherwise be accepted as well.
     *
     * @param telnumber text to validate
     * @return {@code true} if the complete string is such a number
     */
    public static boolean checkTel(String telnumber) {
        String regex = "1[358]\\d{9}";
        return telnumber.matches(regex);
    }

    /**
     * Validates a QQ number: 5 to 15 digits, the first of which is not zero.
     *
     * @param qq text to validate
     * @return {@code true} if the complete string is such a number
     */
    public static boolean checkQQ(String qq) {
        String regex = "[1-9]\\d{4,14}";
        return qq.matches(regex);
    }

    /**
     * Splits {@code path} around matches of {@code regex} and prints each piece on its own line.
     *
     * @param path  text to split
     * @param regex separator expression
     */
    public static void printAfterSplit(String path, String regex) {
        String[] ary = path.split(regex);
        for (String s : ary) {
            System.out.println(s);
        }
    }

    /**
     * Replaces every match of {@code regex} in {@code str} and prints the result.
     *
     * @param str        text to process
     * @param regex      expression to look for
     * @param replaceStr replacement; may reference groups such as {@code $1}
     */
    public static void printAfterReplaceStr(String str, String regex, String replaceStr) {
        String resultStr = str.replaceAll(regex, replaceStr);
        System.out.println(resultStr);
    }
}

  

  

以上是 一个java版本的简单邮箱小爬虫 的全部内容, 来源链接: utcz.com/z/390769.html

回到顶部