Thursday, January 12, 2017

web crawler with java



https://ko.wikipedia.org/wiki/%EC%9B%B9_%ED%81%AC%EB%A1%A4%EB%9F%AC
A web crawler is a computer program that browses the World Wide Web in a methodical, automated way. Other terms for web crawlers include ants, automatic indexers, bots, worms, web spiders, and web robots.


For now, here is the source code, written in Java.
It takes a given HTTP address, fetches the web page, and saves it to a file. A crawl depth can be set; if the HTML page contains several links, each of those links is visited and saved as well.
In the end it does nothing more than fetch and save web pages; parsing the fetched pages to extract and analyze the data you need varies by use case, so that part is left to the reader.
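As an illustration only, here is a minimal sketch of how a saved page could be parsed with the same Jericho library (the file path is just an example; point it at any .htm file the crawler produced):

import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;

import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.Source;

public class SavedPageParser {
 public static void main(String[] args) throws Exception {
  // Example path only: replace with a page saved by the crawler.
  String html = new String(Files.readAllBytes(Paths.get("save/http___www_daum_net_.htm")), "UTF-8");

  Source source = new Source(html);
  source.fullSequentialParse();

  // Print the page title, if present.
  Element title = source.getFirstElement("title");
  if (title != null)
   System.out.println("Title: " + title.getContent().toString().trim());

  // Print every link target together with its anchor text.
  List<Element> anchors = source.getAllElements("a");
  for (Element a : anchors) {
   String href = a.getAttributeValue("href");
   if (href != null)
    System.out.println(href + " -> " + a.getContent().getTextExtractor().toString());
  }
 }
}
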
The libraries used are as follows:
httpclient 4.5
commons-cli-1.3.1.jar
jericho-html-3.4.jar

A more detailed explanation will be added later.

Source code
package prj.dish;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.zip.GZIPInputStream;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHeaders;
import org.apache.http.HttpHost;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.client.utils.URIUtils;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

import net.htmlparser.jericho.Config;
import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.LoggerProvider;
import net.htmlparser.jericho.Source;

public class Webcrawler {
 private int maxDepth = 1;
 private int maxHostChange = 1;
 private String savePath;
 private String host;
 boolean DOMAIN_CHANGE = true;
 byte[] htmlByte = null;
 HashSet<String> visited = new HashSet<String>();
 CloseableHttpClient httpclient = HttpClients.createDefault();


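 // Entry point: parses the command-line options (-s, -u, -d, -c), configures the crawler, and starts crawling from the given URL.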
 public static void main(String[] args) {
  System.out.println("Welcome !! Webcrawler");
  Config.LoggerProvider=LoggerProvider.DISABLED;
  System.setProperty("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.NoOpLog"); 
  if(args.length >= 1){
   Options options = new Options();

   Option savepath = new Option("s", "savepath", true, "input save folder file path");
   savepath.setRequired(true);
   options.addOption(savepath);

   Option url = new Option("u", "url", true, "url ex) http://www.daum.net");
   url.setRequired(true);
   options.addOption(url);

   Option depth = new Option("d", "depth", true, "max depth");
   depth.setRequired(false);
   options.addOption(depth);

   Option changehostdepth = new Option("c", "changehostdepth", true, "change host depth");
   changehostdepth.setRequired(false);
   options.addOption(changehostdepth);

   CommandLineParser parser = new DefaultParser();
   HelpFormatter formatter = new HelpFormatter();
   CommandLine cmd;

   try {
    cmd = parser.parse(options, args);
   } catch (ParseException e) {
    System.out.println(e.getMessage());
    formatter.printHelp("Webcrawler", options);
    System.exit(1);
    return;
   }

   String saveFilePath = cmd.getOptionValue("savepath");
   String urlPath = cmd.getOptionValue("url");
   String depthParam = cmd.getOptionValue("depth");
   if(depthParam==null || depthParam.isEmpty()) depthParam = "2";
   String changehostdepthdepthParam = cmd.getOptionValue("changehostdepth");
   if(changehostdepthdepthParam==null || changehostdepthdepthParam.isEmpty()) changehostdepthdepthParam = "1";
   System.out.println(urlPath);
   Webcrawler crawler;
   crawler = new Webcrawler();
   crawler.setSavePath(saveFilePath);
   crawler.setMaxDepth(Integer.valueOf(depthParam));
   crawler.setMaxHostChange(Integer.valueOf(changehostdepthdepthParam));
   crawler.run(urlPath);
  }
  System.out.println("End Webcrawler");
 }

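 // Starts the crawl from the given start URL at depth 0.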
 private void run(String string) {
  host = string;
  connect( host, "/", 0, 0);
 }
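 // Returns the most recently downloaded page decoded as a UTF-8 string.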
 public String getString() {
  try {
   return new String(htmlByte, "UTF-8");
  } catch (UnsupportedEncodingException e) {
   e.printStackTrace();
  }
  return null;
 }
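 // Performs an HTTP GET on the given URL (requesting gzip encoding), stores the response body in htmlByte
 // (decompressing it if necessary), and returns the final location after any redirects.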
 private String getHttp(String url) throws IOException, URISyntaxException{
  String ret=null;
  try {
   HttpGet httpGet = new HttpGet(url);
   HttpClientContext context = HttpClientContext.create();
   httpGet.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip");
   CloseableHttpResponse response = httpclient.execute(httpGet,context);
   try {
    System.out.println(response.getStatusLine());
    HttpEntity entity = response.getEntity();

    Header contentEncoding = response.getFirstHeader("Content-Encoding");
    if (contentEncoding != null && contentEncoding.getValue().equalsIgnoreCase("gzip")) {
     System.out.println("gziped");
     htmlByte = inputStreamToByte( new GZIPInputStream(entity.getContent()));
    }else {
     htmlByte = inputStreamToByte(entity.getContent());
    }

    HttpHost target = context.getTargetHost();
    List<URI> redirectLocations = context.getRedirectLocations();
    URI location = URIUtils.resolve(httpGet.getURI(), target, redirectLocations);
    System.out.println("Final HTTP location: " + location.toASCIIString());
    ret = location.toASCIIString();
   } finally {
    response.close();
   }
  } finally {
   //httpclient.close();
  }
  return ret;
 }
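 // Reads an entire InputStream into a byte array using a 1 KB buffer.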
 private byte[] inputStreamToByte(InputStream in)
 {
  final int BUF_SIZE = 1024;
  ByteArrayOutputStream out = new ByteArrayOutputStream();
  byte[] buffer = new byte[BUF_SIZE];
  try {
   int length;
   while ((length = in.read(buffer)) != -1) out.write(buffer, 0, length);
  } catch (IOException e) {
   e.printStackTrace();
   return null;
  }
  return out.toByteArray();
 }
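 // Recursive crawl step: resolves addurl against lasturl, skips pages that were already visited,
 // fetches and saves the page, then follows every <a href> on it up to maxDepth, counting
 // absolute (http/https) links as host changes limited by maxHostChange.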
 private void connect(String lasturl, String addurl, int depth, int hostchange) {
  Source source = null;
  String newurl = null;
  int hostchanged = 0;

  if(addurl.startsWith("http://") || addurl.startsWith("https://")){
   hostchanged = 1;
   if( maxHostChange <= hostchange+hostchanged) return;
  }
  if( maxDepth <= depth ){
   return;
  }
  try {
   //if(DOMAIN_CHANGE){
   lasturl = calcNextUrl(lasturl, addurl);
   //}else{
   //lasturl = urlChg(host, lasturl, addurl);
   //}
   System.out.println("Get:["+depth+"]:"+lasturl);
   if( !visited.contains(lasturl) ){
    visited.add(lasturl);
   }else{
    System.out.println("visited !");
    return;
   }
   //source=new Source(new URL(lasturl));
   newurl = getHttp(lasturl);
   //fileSave(savePath + changeFileName(lasturl)+".htm",getString());
   fileSave(savePath + changeFileName(lasturl)+".htm",htmlByte);
   source=new Source(getString());

  } catch (Exception e) {
   e.printStackTrace();
   return;
  }
  //System.out.println(source.getRenderer().toString());
  List <Element> elements = source.getAllElements("a");
  System.out.println("Len:("+htmlByte.length+"), A tag("+elements.size()+")");
  for(int i = 0 ; i < elements.size(); i++){
   Element ele = elements.get(i);
   String href = ele.getAttributeValue("href");
   if(href==null || href.isEmpty()) continue;
   if(!DOMAIN_CHANGE){
    if(href.startsWith("http://") || href.startsWith("https://")){
     continue;
    }
   }
   if(href.startsWith("javascript:")){
    continue;
   }else if(href.contains("#")){
    continue;
   }else if(href.startsWith("<")){
    continue;
   }
   connect(newurl,href,depth+1,hostchange+hostchanged);
  }
 }

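 // Writes the raw page bytes to the given file path.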
 private void fileSave(String name, byte[] htmlByte) {
  FileOutputStream stream = null;
  try {
   stream = new FileOutputStream(name);
   stream.write(htmlByte);
  } catch (Exception e) {
   e.printStackTrace();
  } finally {
   try {
    if (stream != null) stream.close();
   } catch (IOException e) {
   }
  }
 }

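 // Turns a URL into a usable file name by replacing characters that are unsafe in file names with underscores.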
 private String changeFileName(String lasturl) {
  lasturl=lasturl.replace('?', '_');
  lasturl=lasturl.replace('*', '_');
  lasturl=lasturl.replace('%', '_');
  lasturl=lasturl.replace('.', '_');
  lasturl=lasturl.replace('/', '_');
  lasturl=lasturl.replace('\\', '_');
  lasturl=lasturl.replace('\"', '_');
  lasturl=lasturl.replace('\'', '_');
  lasturl=lasturl.replace('|', '_');
  lasturl=lasturl.replace('+', '_');
  lasturl=lasturl.replace('-', '_');
  lasturl=lasturl.replace(':', '_');
  return lasturl;
 }

 private void setMaxDepth(int i) {
  maxDepth = i;
 }

 private void setMaxHostChange(int i) {
  maxHostChange = i;
 }

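 // Sets the save folder, creating it if needed, and appends a timestamped subfolder so each run is stored separately.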
 private void setSavePath(String string) {
  savePath = string;
  if(!savePath.endsWith("/")) savePath=savePath+"/";
  createDirectoryIfNeeded(string);
  String timeStamp = new SimpleDateFormat("yyyy.MM.dd.HH.mm").format(new Date());
  savePath=savePath+timeStamp;
  createDirectoryIfNeeded(savePath);
  if(!savePath.endsWith("/")) savePath=savePath+"/";
 }

 private void createDirectoryIfNeeded(String directoryName)
 {
  File theDir = new File(directoryName); 
  if (!theDir.exists())
   theDir.mkdirs();
 }

 private Webcrawler() {
 }

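 // Resolves a link (absolute, root-relative, or relative) against the current URL and normalizes the result.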
 public static String calcNextUrl(String thisurl, String add)
 {
  System.out.println("This:["+thisurl + "]Add:["+add+"]");
  URI thisuri = URI.create(thisurl);
  String data = thisuri.getScheme() + "://" + thisuri.getHost();
  if(thisuri.getPort()!=-1) data = data + ":" + thisuri.getPort();
  if(add.startsWith("/")) data=data+add;
  else if(add.startsWith("http")) data=add;
  else {
   data=thisurl;
   if(data.endsWith("/")) data=data+add;
   else data=data+"/"+add;
  }

  URI returi = URI.create(data);
  returi = returi.normalize();
  if( !returi.toString().startsWith("http") ){
   System.out.println("Error");
  }
  return returi.toString();
 }

 public static void fileSave(String name,String data)
 {
  try {
   File newTextFile = new File(name);
   FileWriter fw = new FileWriter(newTextFile);
   fw.write(data);
   fw.close();
  } catch (IOException iox) {
   iox.printStackTrace();
  }
 }
}


The following options must be passed when running the program.
-u is the URL to crawl.
-s is the name of the folder to save pages into.
-d is the maximum HTML depth. A larger number follows more links, so choose a moderate value.
-c is the maximum number of times the crawler may move to a different site while following links.

-u http://www.daum.net -s save -d 2 -c 1

Sample run
Welcome !! Webcrawler
http://www.daum.net
This:[http://www.daum.net]Add:[/]
Get:[0]:http://www.daum.net/
HTTP/1.1 200 OK
Final HTTP location: http://www.daum.net/
Len:(154748), A tag(170)
This:[http://www.daum.net/]Add:[/doc/top_accessibility.html]
Get:[1]:http://www.daum.net/doc/top_accessibility.html
HTTP/1.1 200 OK
Final HTTP location: http://www.daum.net/doc/top_accessibility.html
Len:(28899), A tag(40)
End Webcrawler

Code explanation
http://swlock.blogspot.com/2017/01/web-crawler-with-java_20.html

