Web crawler with Java
https://ko.wikipedia.org/wiki/%EC%9B%B9_%ED%81%AC%EB%A1%A4%EB%9F%AC
A web crawler is a computer program that browses the World Wide Web in a methodical, automated way. Other terms for web crawlers include ants, automatic indexers, bots, worms, web spiders, and web robots.
For now, I'm sharing the source code, written in Java, below.
It takes a given HTTP address, fetches the web page, and saves it to a file. You set a maximum depth, and if the HTML page contains several links, each linked page is visited and saved as well.
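At its core that is just an HTTP GET followed by a file write. Here is a minimal sketch of the fetch-and-save step, assuming HttpClient 4.5; the class name FetchAndSave, the target URL, and the output file page.htm are only illustrative and are not part of the crawler source below.

import java.io.FileOutputStream;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class FetchAndSave {
    public static void main(String[] args) throws Exception {
        // Fetch one page and write its raw bytes to a file.
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(new HttpGet("http://www.daum.net"))) {
            byte[] body = EntityUtils.toByteArray(response.getEntity());
            try (FileOutputStream out = new FileOutputStream("page.htm")) {
                out.write(body);
            }
        }
    }
}

The full source below does the same thing, but also handles gzip-encoded responses, redirects, and recursion into links.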
In the end, all it does is fetch and save web pages; parsing the fetched pages to extract and analyze the data you need differs by use case, so I leave that part to the reader.
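The same jericho-html library can be used for that parsing step. A minimal sketch, assuming a page the crawler previously saved at save/page.htm (the class name ParseSaved and that path are just examples):

import java.io.File;
import java.util.List;
import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.Source;

public class ParseSaved {
    public static void main(String[] args) throws Exception {
        // Load a page that the crawler saved earlier.
        Source source = new Source(new File("save/page.htm"));
        source.fullSequentialParse();

        // Example: list every link target on the page.
        List<Element> anchors = source.getAllElements("a");
        for (Element a : anchors) {
            String href = a.getAttributeValue("href");
            if (href != null) System.out.println(href);
        }

        // Example: dump the page as plain text.
        System.out.println(source.getTextExtractor().toString());
    }
}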
The libraries used are as follows (Maven coordinates are noted after the list):
httpclient 4.5
commons-cli-1.3.1.jar
jericho-html-3.4.jar
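If you use Maven, these should correspond to org.apache.httpcomponents:httpclient:4.5, commons-cli:commons-cli:1.3.1, and net.htmlparser.jericho:jericho-html:3.4 on Maven Central (coordinates assumed, so verify the versions); otherwise simply put the jars on the classpath.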
A more detailed explanation will be added later.
Source
package prj.dish;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.zip.GZIPInputStream;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHeaders;
import org.apache.http.HttpHost;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.client.utils.URIUtils;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import net.htmlparser.jericho.Config;
import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.LoggerProvider;
import net.htmlparser.jericho.Source;

// Simple crawler: fetches a start URL, saves each page to disk, and follows <a href> links
// up to a maximum depth and a maximum number of host changes.
public class Webcrawler {
    private int maxDepth = 1;
    private int maxHostChange = 1;
    private String savePath;
    private String host;
    boolean DOMAIN_CHANGE = true;
    byte[] htmlByte = null;
    HashSet<String> visited = new HashSet<String>();
    CloseableHttpClient httpclient = HttpClients.createDefault();

    // Parse command-line options and kick off the crawl.
    public static void main(String[] args) {
        System.out.println("Welcome !! Webcrawler");
        Config.LoggerProvider = LoggerProvider.DISABLED;
        System.setProperty("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.NoOpLog");
        if (args.length >= 1) {
            Options options = new Options();
            Option savepath = new Option("s", "savepath", true, "input save folder file path");
            savepath.setRequired(true);
            options.addOption(savepath);
            Option url = new Option("u", "url", true, "url ex) http://www.daum.net");
            url.setRequired(true);
            options.addOption(url);
            Option depth = new Option("d", "depth", true, "max depth");
            depth.setRequired(false);
            options.addOption(depth);
            Option changehostdepth = new Option("c", "changehostdepth", true, "change host depth");
            changehostdepth.setRequired(false);
            options.addOption(changehostdepth);

            CommandLineParser parser = new DefaultParser();
            HelpFormatter formatter = new HelpFormatter();
            CommandLine cmd;
            try {
                cmd = parser.parse(options, args);
            } catch (ParseException e) {
                System.out.println(e.getMessage());
                formatter.printHelp("Webcrawler", options);
                System.exit(1);
                return;
            }

            String saveFilePath = cmd.getOptionValue("savepath");
            String urlPath = cmd.getOptionValue("url");
            String depthParam = cmd.getOptionValue("depth");
            if (depthParam == null || depthParam.isEmpty()) depthParam = "2";
            String changehostdepthdepthParam = cmd.getOptionValue("changehostdepth");
            if (changehostdepthdepthParam == null || changehostdepthdepthParam.isEmpty()) changehostdepthdepthParam = "1";

            System.out.println(urlPath);
            Webcrawler crawler;
            crawler = new Webcrawler();
            crawler.setSavePath(saveFilePath);
            crawler.setMaxDepth(Integer.valueOf(depthParam));
            crawler.setMaxHostChange(Integer.valueOf(changehostdepthdepthParam));
            crawler.run(urlPath);
        }
        System.out.println("End Webcrawler");
    }

    // Crawl starting from the root path of the given URL.
    private void run(String string) {
        host = string;
        connect(host, "/", 0, 0);
    }

    // Return the last fetched page decoded as UTF-8.
    public String getString() {
        try {
            return new String(htmlByte, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return null;
    }

    // GET the URL (accepting gzip), store the body in htmlByte, and return the final URL after redirects.
    private String getHttp(String url) throws IOException, URISyntaxException {
        String ret = null;
        try {
            HttpGet httpGet = new HttpGet(url);
            HttpClientContext context = HttpClientContext.create();
            httpGet.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip");
            CloseableHttpResponse response = httpclient.execute(httpGet, context);
            try {
                System.out.println(response.getStatusLine());
                HttpEntity entity = response.getEntity();
                Header contentEncoding = response.getFirstHeader("Content-Encoding");
                if (contentEncoding != null && contentEncoding.getValue().equalsIgnoreCase("gzip")) {
                    System.out.println("gziped");
                    htmlByte = inputStreamToByte(new GZIPInputStream(entity.getContent()));
                } else {
                    htmlByte = inputStreamToByte(entity.getContent());
                }
                HttpHost target = context.getTargetHost();
                List<URI> redirectLocations = context.getRedirectLocations();
                URI location = URIUtils.resolve(httpGet.getURI(), target, redirectLocations);
                System.out.println("Final HTTP location: " + location.toASCIIString());
                ret = location.toASCIIString();
            } finally {
                response.close();
            }
        } finally {
            //httpclient.close();
        }
        return ret;
    }

    // Read an InputStream fully into a byte array.
    private byte[] inputStreamToByte(InputStream in) {
        final int BUF_SIZE = 1024;
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        byte[] buffer = new byte[BUF_SIZE];
        try {
            int length;
            while ((length = in.read(buffer)) != -1)
                out.write(buffer, 0, length);
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
        return out.toByteArray();
    }

    // Resolve addurl against lasturl, fetch and save the page, then recurse into its <a href> links
    // within the depth and host-change limits.
    private void connect(String lasturl, String addurl, int depth, int hostchange) {
        Source source = null;
        String newurl = null;
        int hostchanged = 0;
        if (addurl.startsWith("http://") || addurl.startsWith("https://")) {
            hostchanged = 1;
            if (maxHostChange <= hostchange + hostchanged) return;
        }
        if (maxDepth <= depth) {
            return;
        }
        try {
            //if(DOMAIN_CHANGE){
            lasturl = calcNextUrl(lasturl, addurl);
            //}else{
            //lasturl = urlChg(host, lasturl, addurl);
            //}
            System.out.println("Get:[" + depth + "]:" + lasturl);
            if (!visited.contains(lasturl)) {
                visited.add(lasturl);
            } else {
                System.out.println("visited !");
                return;
            }
            //source=new Source(new URL(lasturl));
            newurl = getHttp(lasturl);
            //fileSave(savePath + changeFileName(lasturl)+".htm",getString());
            fileSave(savePath + changeFileName(lasturl) + ".htm", htmlByte);
            source = new Source(getString());
        } catch (Exception e) {
            e.printStackTrace();
            return;
        }
        //System.out.println(source.getRenderer().toString());
        List<Element> elements = source.getAllElements("a");
        System.out.println("Len:(" + htmlByte.length + "), A tag(" + elements.size() + ")");
        for (int i = 0; i < elements.size(); i++) {
            Element ele = elements.get(i);
            String href = ele.getAttributeValue("href");
            if (href == null || href.isEmpty()) continue;
            if (!DOMAIN_CHANGE) {
                if (href.startsWith("http://") || href.startsWith("https://")) {
                    continue;
                }
            }
            if (href.startsWith("javascript:")) {
                continue;
            } else if (href.contains("#")) {
                continue;
            } else if (href.startsWith("<")) {
                continue;
            }
            connect(newurl, href, depth + 1, hostchange + hostchanged);
        }
    }

    // Write raw page bytes to a file.
    private void fileSave(String name, byte[] htmlByte) {
        FileOutputStream stream = null;
        try {
            stream = new FileOutputStream(name);
            stream.write(htmlByte);
        } catch (Exception e) {
        } finally {
            try {
                if (stream != null) stream.close();
            } catch (IOException e) {
            }
        }
    }

    // Turn a URL into a safe file name by replacing special characters with underscores.
    private String changeFileName(String lasturl) {
        lasturl = lasturl.replace('?', '_');
        lasturl = lasturl.replace('*', '_');
        lasturl = lasturl.replace('%', '_');
        lasturl = lasturl.replace('.', '_');
        lasturl = lasturl.replace('/', '_');
        lasturl = lasturl.replace('\\', '_');
        lasturl = lasturl.replace('\"', '_');
        lasturl = lasturl.replace('\'', '_');
        lasturl = lasturl.replace('|', '_');
        lasturl = lasturl.replace('+', '_');
        lasturl = lasturl.replace('-', '_');
        lasturl = lasturl.replace(':', '_');
        return lasturl;
    }

    private void setMaxDepth(int i) {
        maxDepth = i;
    }

    private void setMaxHostChange(int i) {
        maxHostChange = i;
    }

    // Create the save folder and a timestamped subfolder inside it.
    private void setSavePath(String string) {
        savePath = string;
        if (!savePath.endsWith("/")) savePath = savePath + "/";
        createDirectoryIfNeeded(string);
        String timeStamp = new SimpleDateFormat("yyyy.MM.dd.HH.mm").format(new Date());
        savePath = savePath + timeStamp;
        createDirectoryIfNeeded(savePath);
        if (!savePath.endsWith("/")) savePath = savePath + "/";
    }

    private void createDirectoryIfNeeded(String directoryName) {
        File theDir = new File(directoryName);
        if (!theDir.exists()) theDir.mkdirs();
    }

    private void Webcrawler() {
    }

    // Build the absolute URL for a link that may be absolute, root-relative, or relative.
    public static String calcNextUrl(String thisurl, String add) {
        System.out.println("This:[" + thisurl + "]Add:[" + add + "]");
        URI thisuri = URI.create(thisurl);
        String data = thisuri.getScheme() + "://" + thisuri.getHost();
        if (thisuri.getPort() != -1) data = data + ":" + thisuri.getPort(); // keep an explicit port if present
        if (add.startsWith("/")) data = data + add;
        else if (add.startsWith("http")) data = add;
        else {
            data = thisurl;
            if (data.endsWith("/")) data = data + add;
            else data = data + "/" + add;
        }
        URI returi = URI.create(data);
        returi = returi.normalize();
        if (!returi.toString().startsWith("http")) {
            System.out.println("Error");
        }
        return returi.toString();
    }

    // Write a String to a text file.
    public static void fileSave(String name, String data) {
        try {
            File newTextFile = new File(name);
            FileWriter fw = new FileWriter(newTextFile);
            fw.write(data);
            fw.close();
        } catch (IOException iox) {
            iox.printStackTrace();
        }
    }
}
When running it, you need to pass options like the ones below (a full command-line example follows the list).
-u is the URL to start crawling from.
-s is the folder where pages are saved.
-d is the maximum HTML depth; a larger number follows more links, so choose a reasonable value.
-c is the maximum number of times the crawl may cross over to a different site while following links.
-u http://www.daum.net -s save -d 2 -c 1
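With the jars above (plus HttpClient's transitive dependencies such as httpcore and commons-logging) on the classpath, a full invocation might look like the following; the exact jar file names are illustrative:

java -cp .:httpclient-4.5.jar:httpcore-4.4.1.jar:commons-logging-1.2.jar:commons-cli-1.3.1.jar:jericho-html-3.4.jar prj.dish.Webcrawler -u http://www.daum.net -s save -d 2 -c 1

Pages are then written under a timestamped subfolder of the save folder, with each URL converted into a file name ending in .htm.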
Execution output
Welcome !! Webcrawler
http://www.daum.net
This:[http://www.daum.net]Add:[/]
Get:[0]:http://www.daum.net/
HTTP/1.1 200 OK
Final HTTP location: http://www.daum.net/
Len:(154748), A tag(170)
This:[http://www.daum.net/]Add:[/doc/top_accessibility.html]
Get:[1]:http://www.daum.net/doc/top_accessibility.html
HTTP/1.1 200 OK
Final HTTP location: http://www.daum.net/doc/top_accessibility.html
Len:(28899), A tag(40)
End Webcrawler
Code explanation
http://swlock.blogspot.com/2017/01/web-crawler-with-java_20.html