网易微博信息抓取程序代码
主函数:
package com;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.net.HttpURLConnection; import java.net.URL;
import java.util.Calendar;
import java.util.TimerTask;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter; import org.htmlparser.filters.HasAttributeFilter; import org.htmlparser.filters.TagNameFilter; import org.htmlparser.util.NodeList;
public class SohuTask extends TimerTask {
/**
 * Task body executed by the timer: calls {@link #sohuParser()} once and then
 * sleeps for 10 milliseconds.
 *
 * <p>If the thread is interrupted while sleeping, the stack trace is printed
 * and the thread's interrupt status is restored so that callers further up
 * the stack can still observe the interruption.
 */
@Override
public void run() {
    try {
        sohuParser();
        Thread.sleep(10);
    } catch (InterruptedException e) {
        e.printStackTrace();
        // Thread.sleep clears the interrupt flag when it throws; set it again
        // rather than silently discarding the interruption request.
        Thread.currentThread().interrupt();
    }
}
public void sohuParser(){
try{
String url = "
method=click&f=ws_gs_id";
//BufferedReader in1=new BufferedReader(new InputStreamReader(System.in));
//url=in1.readLine();
Parser parser = new Parser( (HttpURLConnection) (new
URL(url)).openConnection() );
Parser parser1= new Parser( (HttpURLConnection) (new
URL(url)).openConnection() );
// This is the part that controls the test; the later examples modify this section.
NodeFilter tag =new TagNameFilter("p");
NodeFilter tag1 =new TagNameFilter("a");
NodeFilter filter = new HasAttributeFilter
("class","message");
NodeFilter filter1=new HasAttributeFilter
("class","time");
NodeFilter last =new AndFilter(tag,filter);
NodeFilter last1 =new AndFilter(tag1,filter1);
NodeList nodes = parser.extractAllNodesThatMatch
(last);
NodeList nodes1 = parser1.extractAllNodesThatMatch
(last1);
if(nodes!=null) {
for (int i = 0; i