package com.star.crawlerweb;
import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.net.URLDecoder; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Scanner; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern;
/**
 * Static file helpers used by the crawler: reading and writing text files,
 * merging and splitting files, de-duplicating lines, counting regex matches
 * and cleaning up working directories.
 *
 * <p>All methods are static and keep no state. Relative paths are resolved
 * against the process working directory.
 */
public class FileUtil {

    /** File-name suffix of the chunk files produced by {@link #divide(String, long)}. */
    public static final String SUFFIX = ".dat";

    /** Buffer size, in bytes, used when copying file contents. */
    public static final int BUFSIZE = 1024 * 8;

    /** Minimum number of occurrences a match needs before it is reported. */
    private static final int MIN_COUNT = 5;

    /** Directory that {@link #divide(String, long)} writes its chunk files into. */
    private static final String CHUNK_DIR = "./htmlfind";

    /**
     * Counts tag occurrences in a source file and appends the frequent ones to a result file.
     *
     * <p>Two passes are made over the source text: one counting every
     * "slash + ASCII letters" tag, and one counting every "CJK word + slash + tag"
     * pair. Each pass is preceded by a separator line, and only matches seen at
     * least five times are written.
     *
     * @param sourcePath file to analyse
     * @param resultPath file the report is appended to
     * @throws FileNotFoundException if the source file cannot be opened
     * @throws IOException if reading the source file fails
     */
    public static void extractedOther(String sourcePath, String resultPath)
            throws FileNotFoundException, IOException {
        StringBuilder builder = readSource(sourcePath);
        String pattenAttr = "\\/+[a-zA-Z]+";
        String pattenall = "([\u4e00-\u9fa5]+)\\/+[a-zA-Z]+";
        Map<String, Integer> mapattr = countWord(builder, pattenAttr);
        Map<String, Integer> mapall = countWord(builder, pattenall);

        writefile("=========分割线===========" + "\n", resultPath);
        for (Map.Entry<String, Integer> entry : mapattr.entrySet()) {
            if (entry.getValue() >= MIN_COUNT) {
                // Tag only: strip the slash.
                writefile(entry.getKey().replace("/", "") + " " + entry.getValue() + "\n",
                        resultPath);
            }
        }

        writefile("=========分割线===========" + "\n", resultPath);
        for (Map.Entry<String, Integer> entry : mapall.entrySet()) {
            if (entry.getValue() >= MIN_COUNT) {
                // Word + tag: turn the slash into a space.
                writefile(entry.getKey().replaceAll("/", " ") + " " + entry.getValue() + "\n",
                        resultPath);
            }
        }
    }

    /**
     * Concatenates the given files, in order, into {@code outFile}.
     *
     * <p>I/O failures are reported on standard error and end the merge early;
     * they are not propagated. The success message is printed only when every
     * input was copied.
     *
     * @param outFile destination file, truncated if it already exists
     * @param files   paths of the files to append, in order
     */
    public static void mergeFiles(String outFile, String[] files) {
        System.out.println("Merge " + Arrays.toString(files) + " into " + outFile);
        try (FileOutputStream outStream = new FileOutputStream(outFile);
                FileChannel outChannel = outStream.getChannel()) {
            ByteBuffer bb = ByteBuffer.allocate(BUFSIZE);
            for (String f : files) {
                try (FileInputStream inStream = new FileInputStream(f);
                        FileChannel fc = inStream.getChannel()) {
                    bb.clear();
                    while (fc.read(bb) != -1) {
                        bb.flip();
                        // A single write() may consume only part of the buffer.
                        while (bb.hasRemaining()) {
                            outChannel.write(bb);
                        }
                        bb.clear();
                    }
                }
            }
            System.out.println("合并成功 ");
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
    }

    /**
     * Post-processes a crawl: rebuilds {@code resultAll.txt} from the unique lines of
     * {@code resultcrawler.txt}, removes the {@code htmlfind} and {@code htmlnext}
     * directories and the html files in the working directory, then asks on
     * standard input whether tag statistics should be appended to
     * {@code resultcount.txt}.
     *
     * @throws Exception if reading the crawl result fails or the answer typed on
     *                   standard input is not an integer
     */
    public static void resultCut() throws Exception {
        String path = "resultAll.txt";
        File file = new File(path);
        if (file.exists() && file.isFile()) {
            // Start from a clean file because writefile() always appends.
            file.delete();
        }
        bigFileCut("resultcrawler.txt");
        System.out.println("去重结果保存在resultAll.txt中" + "\n");
        System.out.println("词数统计成功,结果保存在" + "resultcount.txt" + "中");
        deleteDirectory("htmlfind");
        deleteDirectory("htmlnext");
        deleteHtml("./");

        // Deliberately not closed: closing the scanner would close System.in.
        @SuppressWarnings("resource")
        Scanner scan = new Scanner(System.in);
        System.out.println("是否统计词性出现次数?是:1 否:0\n");
        int flag = scan.nextInt();
        if (flag == 1) {
            extractedOther("resultAll.txt", "resultcount.txt");
            System.out.println("词数统计成功,结果保存在" + "resultcount.txt" + "中");
        }
    }

    /**
     * Appends every distinct line of {@code path} to {@code resultAll.txt}.
     *
     * <p>The file is de-duplicated in one streaming pass. It is intentionally not
     * split into byte-sized chunks first: a split at an arbitrary byte offset cuts
     * lines (and multi-byte UTF-8 characters) in two, which produces bogus
     * "unique" lines.
     *
     * @param path file whose lines are de-duplicated
     * @throws IOException if the file cannot be read
     */
    private static void bigFileCut(String path) throws IOException {
        Set<String> unique = RemoveDuplicate(path);
        for (String encoded : unique) {
            String line = URLDecoder.decode(encoded, "utf-8");
            System.out.println("正在写入" + line + "\n");
            writefile(line + "\n", "resultAll.txt");
        }
    }

    /**
     * Deletes every direct child of {@code path} whose name ends with {@code html}.
     * Does nothing when {@code path} cannot be listed (for example, when it is not
     * a directory).
     *
     * @param path directory to clean
     */
    public static void deleteHtml(String path) {
        File[] entries = new File(path).listFiles();
        if (entries == null) {
            return;
        }
        for (File entry : entries) {
            if (entry.getName().endsWith("html")) {
                entry.delete();
            }
        }
    }

    /**
     * Counts runs of CJK characters in a source file and appends those seen at
     * least five times to {@code resultPath} as {@code first@word: count} lines.
     * Words for which {@link #isKey(String, String, Object)} reports a match with
     * {@code first} are skipped.
     *
     * @param first      label written in front of every reported word
     * @param sourcePath file to analyse
     * @param resultPath file the report is appended to
     * @throws IOException if the source file cannot be read
     */
    public static void extractedWord(String first, String sourcePath, String resultPath)
            throws IOException {
        StringBuilder builder = readSource(sourcePath);
        String pattenWord = "([\u4e00-\u9fa5]+)";
        Map<String, Integer> mapword = countWord(builder, pattenWord);
        for (Map.Entry<String, Integer> entry : mapword.entrySet()) {
            String key = entry.getKey();
            if (entry.getValue() >= MIN_COUNT && !isKey(first, pattenWord, key)) {
                writefile(first + "@" + key + ": " + entry.getValue() + "\n", resultPath);
            }
        }
    }

    /**
     * Reads a whole text file into a builder using {@link FileReader}'s default charset.
     *
     * <p>Line terminators are dropped and nothing is inserted in their place, so
     * the last token of one line and the first token of the next become adjacent.
     *
     * @param sourcePath file to read
     * @return the file content with all line terminators removed
     * @throws FileNotFoundException if the file cannot be opened
     * @throws IOException if reading fails
     */
    public static StringBuilder readSource(String sourcePath)
            throws FileNotFoundException, IOException {
        StringBuilder builder = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(new FileReader(new File(sourcePath)))) {
            String line;
            while ((line = reader.readLine()) != null) {
                builder.append(line);
            }
        }
        return builder;
    }

    /**
     * Tells whether {@code key} and {@code first} share a regex match at the same position.
     *
     * <p>The matches of {@code pattenWord} in both strings are walked in lockstep:
     * the n-th match of {@code key} is compared only with the n-th match of
     * {@code first}, and the walk stops as soon as either string runs out of matches.
     *
     * @param first      text to compare against
     * @param pattenWord regular expression that extracts the words
     * @param key        candidate; its {@code toString()} value is scanned
     * @return {@code true} if some pair of same-position matches is equal
     */
    public static boolean isKey(String first, String pattenWord, Object key) {
        Pattern pattern = Pattern.compile(pattenWord);
        Matcher keyMatcher = pattern.matcher(key.toString());
        Matcher firstMatcher = pattern.matcher(first);
        while (keyMatcher.find() && firstMatcher.find()) {
            if (keyMatcher.group().equals(firstMatcher.group())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Counts how often each distinct match of a regular expression occurs in a text.
     *
     * @param builder text to scan
     * @param patten  regular expression; the whole match (group 0) is the key
     * @return map from matched text to its number of occurrences
     */
    public static Map<String, Integer> countWord(StringBuilder builder, String patten) {
        Matcher matcher = Pattern.compile(patten).matcher(builder.toString());
        Map<String, Integer> counts = new HashMap<String, Integer>();
        while (matcher.find()) {
            String word = matcher.group();
            Integer seen = counts.get(word);
            counts.put(word, seen == null ? 1 : seen + 1);
        }
        return counts;
    }

    /**
     * Returns the distinct lines of a UTF-8 text file.
     *
     * @param path file to read
     * @return the distinct lines, each URL-encoded as UTF-8 (same form as {@link #readfile(String)})
     * @throws IOException if the file cannot be read
     * @throws UnsupportedEncodingException declared for compatibility with existing callers
     */
    public static Set<String> RemoveDuplicate(String path)
            throws IOException, UnsupportedEncodingException {
        Set<String> unique = new HashSet<String>();
        readEncodedLines(path, unique);
        return unique;
    }

    /**
     * Splits a file into consecutive chunks of at most {@code size} bytes, written
     * to {@code ./htmlfind/text0.dat}, {@code ./htmlfind/text1.dat}, and so on.
     * The directory is created if it is missing. An empty file produces no chunks.
     *
     * <p>The split is purely byte-based, so a chunk boundary may fall in the
     * middle of a line or of a multi-byte character.
     *
     * @param name file to split
     * @param size maximum chunk size in bytes; when not positive, half of the file
     *             length (at least one byte) is used
     * @throws Exception if {@code name} is not an existing regular file, or if an
     *                   I/O error occurs
     */
    public static void divide(String name, long size) throws Exception {
        File file = new File(name);
        if (!file.exists() || !file.isFile()) {
            throw new Exception("指定文件不存在!");
        }
        long fileLength = file.length();
        if (size <= 0) {
            // Never let the chunk size reach zero; it is used as a divisor below.
            size = Math.max(1L, fileLength / 2);
        }
        int num = (fileLength % size != 0)
                ? (int) (fileLength / size + 1)
                : (int) (fileLength / size);

        // A failure to create the directory surfaces when the first chunk is opened.
        new File(CHUNK_DIR).mkdirs();

        byte[] buffer = new byte[BUFSIZE];
        try (FileInputStream in = new FileInputStream(file)) {
            long copied = 0;
            for (int i = 0; i < num; i++) {
                File outFile = new File(CHUNK_DIR, "text" + i + SUFFIX);
                // Compared this way round so that a huge size cannot overflow the sum.
                long end = (size > fileLength - copied) ? fileLength : copied + size;
                try (FileOutputStream out = new FileOutputStream(outFile)) {
                    while (copied < end) {
                        int want = (int) Math.min((long) buffer.length, end - copied);
                        int got = in.read(buffer, 0, want);
                        if (got == -1) {
                            // The file shrank while it was being split.
                            break;
                        }
                        out.write(buffer, 0, got);
                        copied += got;
                    }
                }
                System.out.println("第" + (i + 1) + "个子文件生成……");
            }
        }
    }

    /**
     * Reads a UTF-8 text file line by line.
     *
     * @param path file to read
     * @return the lines in file order, each URL-encoded as UTF-8
     * @throws IOException if the file cannot be opened or read
     */
    public static List<String> readfile(String path) throws IOException {
        List<String> lines = new ArrayList<String>();
        readEncodedLines(path, lines);
        return lines;
    }

    /** Adds every line of the UTF-8 file at {@code path}, URL-encoded, to {@code sink}. */
    private static void readEncodedLines(String path, java.util.Collection<String> sink)
            throws IOException {
        try (FileInputStream stream = new FileInputStream(new File(path));
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(stream, "utf-8"), 5 * 1024 * 1024)) {
            String line;
            while ((line = reader.readLine()) != null) {
                sink.add(java.net.URLEncoder.encode(line, "utf-8"));
            }
        }
    }

    /**
     * Appends text to a file, creating the file when it does not exist.
     *
     * <p>The text is written with {@link FileWriter}'s default charset. I/O
     * failures are reported on standard error and otherwise ignored.
     *
     * @param m    text to append
     * @param path file to append to; any directory part of the path is honoured
     */
    public static void writefile(String m, String path) {
        // Open the File itself, not file.getName(): the bare name would drop the
        // directory part and write into the working directory.
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(new File(path), true))) {
            writer.write(m);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Creates a directory together with any missing parents.
     *
     * @param destDirName directory to create
     * @return {@code true} if the directory was created; {@code false} if the path
     *         already existed or creation failed
     */
    public static boolean createDir(String destDirName) {
        File dir = new File(destDirName);
        return !dir.exists() && dir.mkdirs();
    }

    /**
     * Recursively deletes a directory and everything below it. Deletion stops at
     * the first entry that cannot be removed.
     *
     * @param sPath directory to delete
     * @return {@code true} if the directory and all its content were deleted;
     *         {@code false} if the path is not an existing directory, cannot be
     *         listed, or some entry could not be removed
     */
    public static boolean deleteDirectory(String sPath) {
        if (!sPath.endsWith(File.separator)) {
            sPath = sPath + File.separator;
        }
        File dirFile = new File(sPath);
        if (!dirFile.exists() || !dirFile.isDirectory()) {
            return false;
        }
        File[] children = dirFile.listFiles();
        if (children == null) {
            // The listing failed (I/O error or insufficient permission).
            return false;
        }
        for (File child : children) {
            boolean removed = child.isFile()
                    ? deleteFile(child.getAbsolutePath())
                    : deleteDirectory(child.getAbsolutePath());
            if (!removed) {
                return false;
            }
        }
        return dirFile.delete();
    }

    /**
     * Deletes a regular file.
     *
     * @param sPath file to delete
     * @return {@code true} only if the path was a regular file and was actually deleted
     */
    public static boolean deleteFile(String sPath) {
        File file = new File(sPath);
        return file.isFile() && file.delete();
    }

    /** Removes the {@code htmlfind} and {@code htmlnext} directories, then creates them again. */
    public static void clearFile() {
        deleteDirectory("htmlfind");
        deleteDirectory("htmlnext");
        createDir("htmlfind");
        createDir("htmlnext");
    }
}
|