SW정리: zip

Java 에서 Zip 압축 사용하기

대부분의 Zip 예제는 이미 디스크에 존재하는 파일들을 압축하는 예제입니다.
하지만 여기에서는 존재하는 파일을 압축하는 것이 아니라, 저장해야 하는 데이터(byte 배열)를 바로 압축해서 zip 파일로 저장하는 예제를 다룹니다.

Zip 예제

package prj.dish;

import java.io.File;
import java.io.FileOutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

// 1. init() 
// 2. add()
// 3. close()

/**
 * Minimal helper that writes in-memory byte arrays into a zip archive.
 *
 * Call order: {@link #init(String)} once, {@link #add(String, byte[])} for
 * every entry, then {@link #close()}.
 */
public class MZip {
 FileOutputStream fos = null;
 ZipOutputStream zos = null;

 /**
  * Creates (or truncates) the archive file and prepares it for writing.
  * If an archive is already open on this instance it is closed first, so
  * its streams are not leaked.
  *
  * @param output path of the zip file to create
  * @throws Throwable if the file cannot be opened for writing
  */
 public void init(String output) throws Throwable{
  close();
  FileOutputStream file = new FileOutputStream(new File(output));
  try{
   zos = new ZipOutputStream(file);
   fos = file;
  }catch(Throwable e){
   // Do not leave the file handle open when the zip stream cannot be set up.
   file.close();
   throw e;
  }
 }

 /**
  * Writes one entry into the archive.
  *
  * @param name entry name inside the archive; "/" separates folders
  * @param data bytes stored as the entry's content
  * @throws IllegalStateException if no archive is open (init() not called, or close() already called)
  * @throws NullPointerException if name or data is null
  * @throws Throwable if writing to the archive fails
  */
 public void add(String name,byte data[]) throws Throwable{
  if(zos == null){
   throw new IllegalStateException("MZip.init() must be called before add()");
  }
  // Validate before touching the stream so a bad call does not leave an empty entry behind.
  if(data == null){
   throw new NullPointerException("data");
  }
  zos.putNextEntry(new ZipEntry(name));
  zos.write(data);
  zos.closeEntry();
 }

 /**
  * Finishes the archive and releases the underlying file.
  * Safe to call more than once and safe to call when init() never ran.
  *
  * @throws Throwable if finishing or closing the archive fails
  */
 public void close() throws Throwable{
  ZipOutputStream zip = zos;
  FileOutputStream file = fos;
  // Forget the streams first so a failed close is not retried on stale handles.
  zos = null;
  fos = null;
  try{
   if(zip != null) zip.close();
  }finally{
   // Release the file handle even when finishing the zip stream failed.
   if(file != null) file.close();
  }
 }

 /** Demo: writes two small entries under folder "A" into zipfile.zip. */
 public static void main(String[] args) throws Throwable {
  MZip mzip = new MZip();
  mzip.init("zipfile.zip");
  mzip.add("A/data1", "11111".getBytes());
  mzip.add("A/data2", "22222".getBytes());
  mzip.close();
 }
}

실행코드 설명

  MZip mzip = new MZip();
  mzip.init("zipfile.zip");
  mzip.add("A/data1", "11111".getBytes());
  mzip.add("A/data2", "22222".getBytes());
  mzip.close();

MZip Class를 만들었습니다. 호출 순서는 init()->add()->close() 순서 입니다.
init()는 생성할 zip파일이름을 인자로 넘겨줍니다.
add()는 zip안에 개별적으로 존재하는 파일명과 데이터를 넘겨줍니다.
close()는 열어 놓은 stream을 닫습니다.

위 main코드를 실행하면 아래와 같은 결과가 나옵니다.

zipfile.zip 파일이 생성되고, 압축을 풀어 보면 A 폴더가 생기며 그 안에 data1, data2 파일이 아래와 같이 들어 있습니다.

해당 코드를 이미 작성한 webcrawler에 넣어 봤습니다.

기존에는 그냥 파일만 생성했는데요, 이번에는 생성되는 파일이 zip형태로 압축되어 생성되도록 변경하였습니다.

원본 코드 : http://swlock.blogspot.com/2017/01/web-crawler-with-java.html
원본 설명 : http://swlock.blogspot.com/2017/01/web-crawler-with-java_20.html

webcrawler zip추가된 소스
(위에서 만든 MZip Class를 같은 패키지(prj.dish)에 추가해야 합니다.)

package prj.dish;

import java.io.ByteArrayOutputStream;
import java.io.Console;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.zip.GZIPInputStream;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHeaders;
import org.apache.http.HttpHost;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.client.utils.URIUtils;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import net.htmlparser.jericho.Config;
import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.LoggerProvider;
import net.htmlparser.jericho.Source;

// Example : java -jar WebCrawler.jar -u http://finance.daum.net -s E:\webdata
// done : 2017.4.8 출력 파일 zip 압축하기

public class Webcrawler {
 // When true, fetched pages are added to a zip archive through mzip;
 // when false, each page is written as its own file with fileSave().
 public static boolean SUPPORT_MZIP = true;

 // Archive writer; created and initialised in setSavePath() when SUPPORT_MZIP is true.
 private MZip mzip = null;
 // connect() stops descending once the current depth reaches this value.
 private int maxDepth = 1;
 // connect() stops following absolute http(s) links once the accumulated
 // host-change count reaches this value.
 private int maxHostChange = 1;
 // Output location prefix built in setSavePath() (save folder + timestamp).
 private String savePath;
 // Start URL handed to run().
 private String host;
 // When false, connect() skips every absolute http:// or https:// link.
 boolean DOMAIN_CHANGE = true;
 // Body bytes of the most recently fetched page; overwritten by every getHttp() call.
 byte[] htmlByte = null;
 // URLs already requested, so that connect() fetches each URL only once.
 HashSet<String> visited = new HashSet<String>();
 // HTTP client shared by all requests made through getHttp().
 CloseableHttpClient httpclient = HttpClients.createDefault();


 /**
  * Command-line entry point.
  *
  * Options: -s/--savepath (required) output folder, -u/--url (required)
  * start URL, -d/--depth (optional, default 2) maximum depth,
  * -c/--changehostdepth (optional, default 1) host-change limit.
  * Invalid options, including non-numeric depth values, print the usage
  * text and terminate with exit status 1.
  *
  * @param args command-line arguments; with no arguments nothing is crawled
  */
 public static void main(String[] args) {
  System.out.println("Welcome !! Webcrawler");
  Config.LoggerProvider=LoggerProvider.DISABLED;
  System.setProperty("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.NoOpLog");
  if(args.length >= 1){
   Options options = new Options();

   Option savepath = new Option("s", "savepath", true, "input save folder file path");
   savepath.setRequired(true);
   options.addOption(savepath);

   Option url = new Option("u", "url", true, "url ex) http://www.daum.net");
   url.setRequired(true);
   options.addOption(url);

   Option depth = new Option("d", "depth", true, "max depth");
   depth.setRequired(false);
   options.addOption(depth);

   Option changehostdepth = new Option("c", "changehostdepth", true, "change host depth");
   changehostdepth.setRequired(false);
   options.addOption(changehostdepth);

   CommandLineParser parser = new DefaultParser();
   HelpFormatter formatter = new HelpFormatter();
   CommandLine cmd;

   try {
    cmd = parser.parse(options, args);
   } catch (ParseException e) {
    System.out.println(e.getMessage());
    formatter.printHelp("Webcrawler", options);
    System.exit(1);
    return;
   }

   String saveFilePath = cmd.getOptionValue("savepath");
   String urlPath = cmd.getOptionValue("url");
   String depthParam = cmd.getOptionValue("depth");
   if(depthParam==null || depthParam.isEmpty()) depthParam = "2";
   String changehostdepthdepthParam = cmd.getOptionValue("changehostdepth");
   if(changehostdepthdepthParam==null || changehostdepthdepthParam.isEmpty()) changehostdepthdepthParam = "1";
   System.out.println(urlPath);

   // Validate the numeric options before setSavePath() creates the output
   // archive, so a typo does not leave an empty zip file behind.
   int depthValue;
   int hostChangeValue;
   try {
    depthValue = Integer.parseInt(depthParam.trim());
    hostChangeValue = Integer.parseInt(changehostdepthdepthParam.trim());
   } catch (NumberFormatException e) {
    System.out.println("depth and changehostdepth must be integers: " + e.getMessage());
    formatter.printHelp("Webcrawler", options);
    System.exit(1);
    return;
   }

   Webcrawler crawler;
   crawler = new Webcrawler();
   crawler.setSavePath(saveFilePath);
   crawler.setMaxDepth(depthValue);
   crawler.setMaxHostChange(hostChangeValue);
   crawler.run(urlPath);
  }
  System.out.println("End Webcrawler");
 }

 /**
  * Crawls starting from the given URL and then finalises the output archive.
  *
  * @param string start URL; also remembered in the host field
  */
 private void run(String string) {
  host = string;
  try {
   connect( host, "/", 0, 0);
  } finally {
   // MZip: close even when the crawl aborts with an unchecked exception;
   // an archive that is never closed is left incomplete on disk.
   if( SUPPORT_MZIP && mzip != null ){
    try {
     mzip.close();
    } catch (Throwable e) {
     e.printStackTrace();
     exitWait();
    }
   }
  }
 }
 /**
  * Pauses until the user presses Enter so an error message stays visible.
  * Returns immediately when the JVM has no interactive console (for example
  * when input or output is redirected), because System.console() is null then.
  */
 private void exitWait() {
  Console console = System.console();
  if (console == null) {
   return;
  }
  console.readLine();
 }

 /**
  * Decodes the bytes of the most recently fetched page as UTF-8 text.
  *
  * @return the decoded text, or null when the UTF-8 charset is reported as unsupported
  */
 public String getString() {
  String decoded = null;
  try {
   decoded = new String(htmlByte, "UTF-8");
  } catch (UnsupportedEncodingException e) {
   e.printStackTrace();
  }
  return decoded;
 }
 /**
  * Performs an HTTP GET, stores the (gunzipped if necessary) response body
  * in htmlByte and reports where the request finally landed.
  *
  * @param url address to fetch
  * @return the final location after redirects, as an ASCII string
  * @throws IOException if the request or reading the body fails
  * @throws URISyntaxException if the final location cannot be resolved
  */
 private String getHttp(String url) throws IOException, URISyntaxException{
  HttpGet request = new HttpGet(url);
  HttpClientContext context = HttpClientContext.create();
  request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip");

  // Only the response is closed here; the shared httpclient stays open for later requests.
  CloseableHttpResponse response = httpclient.execute(request,context);
  try {
   System.out.println(response.getStatusLine());
   HttpEntity entity = response.getEntity();

   Header encoding = response.getFirstHeader("Content-Encoding");
   boolean gzipped = encoding != null && encoding.getValue().equalsIgnoreCase("gzip");
   InputStream body;
   if (gzipped) {
    System.out.println("gziped");
    body = new GZIPInputStream(entity.getContent());
   } else {
    body = entity.getContent();
   }
   htmlByte = inputStreamToByte(body);

   // Work out the URL actually served once all redirects were followed.
   HttpHost target = context.getTargetHost();
   List<URI> redirects = context.getRedirectLocations();
   String finalUrl = URIUtils.resolve(request.getURI(), target, redirects).toASCIIString();
   System.out.println("Final HTTP location: " + finalUrl);
   return finalUrl;
  } finally {
   response.close();
  }
 }
 /**
  * Reads the stream to its end and returns everything that was read.
  * The stream is not closed by this method.
  *
  * @param in stream to drain
  * @return all bytes read, or null if reading failed (the error is printed)
  */
 private byte[] inputStreamToByte(InputStream in)
 {
  ByteArrayOutputStream collected = new ByteArrayOutputStream();
  byte[] chunk = new byte[1024];
  try {
   for (int count = in.read(chunk); count != -1; count = in.read(chunk)) {
    collected.write(chunk, 0, count);
   }
  } catch (IOException e) {
   e.printStackTrace();
   return null;
  }
  return collected.toByteArray();
 }
 /**
  * Fetches one page, stores it, and recurses into the links it contains.
  *
  * @param lasturl    URL of the page the link was found on
  * @param addurl     link to follow, absolute or relative to lasturl
  * @param depth      current recursion depth; nothing is fetched once it reaches maxDepth
  * @param hostchange number of absolute http(s) links followed on the way here
  */
 private void connect(String lasturl, String addurl, int depth, int hostchange) {
  // Every absolute http(s) link counts as one host change.
  final boolean absoluteLink = addurl.startsWith("http://") || addurl.startsWith("https://");
  final int hostHops = absoluteLink ? hostchange + 1 : hostchange;
  if (absoluteLink && maxHostChange <= hostHops) {
   return;
  }
  if (maxDepth <= depth) {
   return;
  }

  Source page;
  String fetchedFrom;
  try {
   lasturl = calcNextUrl(lasturl, addurl);
   System.out.println("Get:["+depth+"]:"+lasturl);
   // add() reports false when the URL was already in the set.
   if (!visited.add(lasturl)) {
    System.out.println("visited !");
    return;
   }
   fetchedFrom = getHttp(lasturl);

   if (SUPPORT_MZIP) {
    // MZip: a failed archive write is reported but does not stop the crawl.
    try {
     mzip.add(changeFileName(lasturl)+".htm", htmlByte);
    } catch (Throwable e) {
     e.printStackTrace();
    }
   } else {
    fileSave(savePath + changeFileName(lasturl)+".htm",htmlByte);
   }

   page = new Source(getString());
  } catch (Exception e) {
   e.printStackTrace();
   return;
  }

  List<Element> anchors = page.getAllElements("a");
  System.out.println("Len:("+htmlByte.length+"), A tag("+anchors.size()+")");
  for (Element anchor : anchors) {
   String href = anchor.getAttributeValue("href");
   if (href == null || href.isEmpty()) {
    continue;
   }
   boolean absoluteHref = href.startsWith("http://") || href.startsWith("https://");
   if (!DOMAIN_CHANGE && absoluteHref) {
    continue;
   }
   // Skip script links, anything containing a fragment marker, and markup-like values.
   if (href.startsWith("javascript:") || href.contains("#") || href.startsWith("<")) {
    continue;
   }
   // Links are resolved against the URL the page was finally served from.
   connect(fetchedFrom, href, depth+1, hostHops);
  }
 }

 /**
  * Writes the given bytes to a file, replacing any existing content.
  * Failures are printed and otherwise ignored so that one unwritable page
  * does not stop the crawl.
  *
  * @param name     path of the file to write
  * @param htmlByte content to store
  */
 private void fileSave(String name, byte[] htmlByte) {
  FileOutputStream stream = null;
  try{
   stream = new FileOutputStream(name);
   stream.write(htmlByte);
  } catch (Exception e) {
   e.printStackTrace();
  } finally {
   // stream is still null when the file could not be opened.
   if (stream != null) {
    try {
     stream.close();
    } catch (IOException e) {
     e.printStackTrace();
    }
   }
  }
 }

 /**
  * Turns a URL into a flat file name by replacing each of the characters
  * ? * % . / \ " ' | + - : with an underscore.
  *
  * @param lasturl URL to convert
  * @return the URL with every listed character replaced by '_'
  */
 private String changeFileName(String lasturl) {
  final String replaced = "?*%./\\\"'|+-:";
  StringBuilder flat = new StringBuilder(lasturl.length());
  for (int i = 0; i < lasturl.length(); i++) {
   char c = lasturl.charAt(i);
   flat.append(replaced.indexOf(c) >= 0 ? '_' : c);
  }
  return flat.toString();
 }

 /**
  * Sets the depth limit checked by connect().
  *
  * @param i new value for maxDepth
  */
 private void setMaxDepth(int i) {
  this.maxDepth = i;
 }

 /**
  * Sets the host-change limit checked by connect().
  *
  * @param i new value for maxHostChange
  */
 private void setMaxHostChange(int i) {
  this.maxHostChange = i;
 }

 /**
  * Prepares the output location for this run.
  *
  * The given folder is created if missing and a timestamp
  * (yyyy.MM.dd.HH.mm) is appended to it. With SUPPORT_MZIP the result names
  * the archive "&lt;folder&gt;/&lt;timestamp&gt;.zip", which is opened right away;
  * otherwise a "&lt;folder&gt;/&lt;timestamp&gt;/" directory is created.
  *
  * @param string folder that receives the output
  */
 private void setSavePath(String string) {
  savePath = string.endsWith("/") ? string : string + "/";
  createDirectoryIfNeeded(string);
  savePath = savePath + new SimpleDateFormat("yyyy.MM.dd.HH.mm").format(new Date());

  if (!SUPPORT_MZIP) {
   createDirectoryIfNeeded(savePath);
   if (!savePath.endsWith("/")) {
    savePath = savePath + "/";
   }
   return;
  }

  mzip = new MZip();
  try {
   mzip.init(savePath+".zip");
  } catch (Throwable e) {
   e.printStackTrace();
  }
 }

 /**
  * Creates the directory, including missing parents, unless the path already exists.
  *
  * @param directoryName path of the directory to create
  */
 private void createDirectoryIfNeeded(String directoryName)
 {
  File target = new File(directoryName);
  if (target.exists()) {
   return;
  }
  target.mkdirs();
 }

 // NOTE(review): because of the "void" return type this is an ordinary, empty
 // method that happens to share the class name; it is not a constructor.
 // "new Webcrawler()" in main() uses the implicit default constructor instead.
 // Presumably a private constructor was intended - confirm before changing.
 private void Webcrawler() {
 }

 /**
  * Resolves a link found on a page against that page's URL.
  *
  * Absolute URLs are returned as they are, "//host/path" links inherit the
  * page's scheme, "/path" links keep the page's scheme, host and port, and
  * every other link is resolved relative to the page's directory. The
  * result is normalised ("." and ".." segments removed).
  *
  * @param thisurl URL of the page the link was found on
  * @param add     link to resolve
  * @return the resolved, normalised URL
  * @throws IllegalArgumentException if either argument is not a syntactically valid URI
  */
 public static String calcNextUrl(String thisurl, String add)
 {
  System.out.println("This:["+thisurl + "]Add:["+add+"]");
  URI base = URI.create(thisurl);

  // URI.resolve() glues a relative path directly onto the host when the base
  // has no path at all ("http://host" + "x" -> "http://hostx"), so give such
  // a base an explicit root path first.
  String basePath = base.getRawPath();
  if(base.getRawAuthority()!=null && (basePath==null || basePath.isEmpty())){
   base = URI.create(base.getScheme() + "://" + base.getRawAuthority() + "/");
  }

  URI returi = base.resolve(add).normalize();
  if( !returi.toString().startsWith("http") ){
   System.out.println("Error");
  }
  return returi.toString();
 }

 /**
  * Writes text to a file using the platform default charset, replacing any
  * existing content. I/O failures are printed, not thrown.
  *
  * @param name path of the file to write
  * @param data text to store
  */
 public static void fileSave(String name,String data)
 {
  FileWriter fw = null;
  try {
   fw = new FileWriter(new File(name));
   fw.write(data);
  } catch (IOException iox) {
   iox.printStackTrace();
  } finally {
   // Close in finally so the file handle is released even when write() fails.
   if (fw != null) {
    try {
     fw.close();
    } catch (IOException iox) {
     iox.printStackTrace();
    }
   }
  }
 }
}

기존 설명 대비 추가되는 코드는 SUPPORT_MZIP 로 검색하면 추가된 내용만 볼 수 있습니다.

테스트 실행

테스트 실행시 사용한 인자
-u http://www.daum.net -s save -d 2 -c 1

SW정리

2018년 4월 1일 일요일

tar 사용법

TAR

파일 뭉치기

파일 풀기

2017년 4월 8일 토요일

java에서 Zip 압축 사용하기

Java 에서 Zip 압축 사용하기

해당 코드를 이미 작성한 webcrawler에 넣어 봤습니다.

테스트 실행