import java.util.ArrayList;
import java.util.List;

public class Node<T> {

    protected List<Node<T>> children;
    protected Node<T> parent;
    protected T value;

    Node(T value) {
        this.value = value;
    }

    public Node<T> add(T value) {
        if (null == children) {
            children = new ArrayList<Node<T>>();
        }
        Node<T> child = new Node<T>(value);
        child.setParent(this);
        children.add(child);
        return child;
    }
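For orientation, here is a minimal usage sketch of Node as the ChunkStream code further below uses it; getLeaves() and getBranchPath() are not shown in this excerpt, so their semantics are assumed from their call sites:

// Hypothetical usage sketch; getBranchPath()/getLeaves() semantics assumed.
Node<String> root = new Node<String>("ROOT");
Node<String> first = root.add("中国");   // add() returns the new child
first.add("人民");                       // grow one branch: ROOT -> 中国 -> 人民
root.add("中");                          // a second, competing branch
// Each leaf then corresponds to one candidate segmentation path from ROOT.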
private void initLetterCountMap() throws IOException {
    // Read the corpus data (text.dat)
    String letterResource = TextDatReader.read(COMMON_LETTER_RESOURCE_PATH);
    final int len = letterResource.length();
    char c;
    for (int i = 0; i < len; i++) {
        c = letterResource.charAt(i);
        if (TextUtils.isCnLetter(c)) {
            letterCountMap.increase(c);
        }
    }
    totalLetterCount = letterCountMap.count();
}

private void initWordDic() throws IOException {
    // Read the word dictionary (commondic)
    String bytes = TextDatReader.read(COMMON_WORD_DIC_PATH);
    final int len = bytes.length();
    String s = "";
    char c;
    for (int i = 0; i < len; i++) {
        c = bytes.charAt(i);
        if ('\n' == c || '\r' == c || 0 == c) {
            if (!TextUtils.isBlank(s)) {
                dictionary.add(s.trim());
            }
            s = "";
        } else {
            s += c;
        }
        if (0 == c) {
            break;
        }
    }
}
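The CountMap class is not included in this post; a minimal sketch compatible with the calls used here (increase and count) could look like the following, with all internals being an assumption:

import java.util.HashMap;
import java.util.Map;
import java.util.Set;

// Minimal sketch of CountMap (assumed implementation): counts occurrences
// per key and tracks the grand total across all keys.
public class CountMap<T> {

    private final Map<T, Integer> map = new HashMap<T, Integer>();
    private int total = 0;

    public void increase(T key) {
        Integer n = map.get(key);
        map.put(key, n == null ? 1 : n + 1);
        total++;
    }

    public int get(T key) {
        Integer n = map.get(key);
        return n == null ? 0 : n;
    }

    public Set<T> keySet() {
        return map.keySet();
    }

    public int count() {
        return total; // total number of increase() calls across all keys
    }
}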
/**
 * Minimum number of times a candidate word must appear.
 */
private static int LEAST_COUNT_THRESHOLD = 5;

/**
 * Threshold for the solid rate, calculated from the candidate word's
 * occurrence count and the counts of each of its single letters.
 *
 * The smaller this value is, the more new words you will get, but with
 * less accuracy; the greater it is, the fewer new words, but with higher
 * accuracy.
 */
private static double SOLID_RATE_THRESHOLD = 0.018;

/**
 * Threshold for the entropy value, calculated from the counts of the
 * characters immediately preceding and following the candidate word.
 *
 * The smaller this value is, the more new words you will get, but with
 * less accuracy; the greater it is, the fewer new words, but with higher
 * accuracy.
 */
private static double ENTROPY_THRESHOLD = 1.92;
public EntropyJudger(TextIndexer indexer) {
    this.indexer = indexer;
}

// Assumed signature and solid-rate call; getSolidRate is sketched below.
public boolean judge(String candidate) {
    double solidRate = getSolidRate(candidate);
    if (solidRate < SOLID_RATE_THRESHOLD) {
        return false;
    }
    double entropy = getEntropy(candidate);
    if (entropy < ENTROPY_THRESHOLD) {
        return false;
    }
    return true;
}
private double getEntropy(String candidate) {
    Pos pos = new Pos(candidate);
    CountMap<Character> frontCountMap = new CountMap<Character>();
    CountMap<Character> backCountMap = new CountMap<Character>();
    final int candidateLen = candidate.length();
    int off = 0;
    char c;
    double rate, frontEntropy = 0, backEntropy = 0;
    // Visit every occurrence of the candidate in the indexed text and
    // count the Chinese characters adjacent to each side of it
    while (indexer.find(pos).isFound()) {
        off = pos.getPos();
        c = indexer.charAt(off - 1);
        if (TextUtils.isCnLetter(c)) {
            frontCountMap.increase(c);
        }
        c = indexer.charAt(off + candidateLen);
        if (TextUtils.isCnLetter(c)) {
            backCountMap.increase(c);
        }
    }
    // Assumed ending: turn the neighbor counts into Shannon entropy on each
    // side and return the smaller value, since a true word should combine
    // freely with many different neighbors on both sides
    for (Character key : frontCountMap.keySet()) {
        rate = (double) frontCountMap.get(key) / frontCountMap.count();
        frontEntropy -= rate * Math.log(rate);
    }
    for (Character key : backCountMap.keySet()) {
        rate = (double) backCountMap.get(key) / backCountMap.count();
        backEntropy -= rate * Math.log(rate);
    }
    return frontEntropy < backEntropy ? frontEntropy : backEntropy;
}
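The getSolidRate() method referenced in judge() is not shown in this post. The sketch below is one plausible implementation, assuming only the TextIndexer.count(String) API that appears later in the listing; treat the exact formula as an illustration rather than the original code:

// Sketch (assumption): the solid rate measures how "solid" the candidate
// is by comparing whole-word frequency against the frequencies of its
// individual characters.
private double getSolidRate(String candidate) {
    final int len = candidate.length();
    if (len < 2) {
        return 1;
    }
    final double count = indexer.count(candidate);
    if (count < LEAST_COUNT_THRESHOLD) {
        return 0; // appears too rarely to judge reliably
    }
    double rate = 1;
    for (int i = 0; i < len; i++) {
        // ratio of whole-candidate occurrences to single-character occurrences
        rate *= count / indexer.count(String.valueOf(candidate.charAt(i)));
    }
    // normalize by length so longer candidates are not unfairly penalized
    return Math.pow(rate, 1D / len) * Math.sqrt(len);
}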
static {
    // Index the structural (function) characters so that candidates
    // starting or ending with one can be skipped quickly
    for (char c : structuralLetters) {
        structuralLetterSet.add(c);
    }
}

public NewWordDiscover() {
    dictionary = CnDictionary.Instance();
}
/**
 * New word discovery is based on statistics and entropy, so make sure the
 * document size is on the order of 100 KB or more, or you may get
 * unsatisfying results.
 *
 * @param document
 * @return the set of discovered new words
 */
public Set<String> discover(String document) {
    Set<String> set = new HashSet<String>();
    TextIndexer indexer = new CnPreviewTextIndexer(document);
    TextSelector selector = new CnTextSelector(document, MIN_CANDIDATE_LEN,
            MAX_CANDIDATE_LEN);
    EntropyJudger judger = new EntropyJudger(indexer);
    String candidate;
    while (!selector.end()) {
        candidate = selector.next();
        if (TextUtils.isBlank(candidate)) {
            continue;
        }
        // Skip candidates that start or end with a structural letter
        if (structuralLetterSet.contains(candidate.charAt(0))
                || structuralLetterSet.contains(candidate.charAt(candidate.length() - 1))) {
            continue;
        }
        // Replace the IF clause with "set.contains(candidate)" if you want
        // to find new words without any dictionary
        if (dictionary.contains(candidate) || set.contains(candidate)) {
            selector.select();
        } else if (judger.judge(candidate)) {
            set.add(candidate);
        }
    }
    return set;
}
}
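A minimal end-to-end driver might look like the following; the file path is only an example, and TextDatReader.read is assumed to behave as in the loading code above:

// Hypothetical driver: read a corpus file and print every discovered word.
NewWordDiscover discover = new NewWordDiscover();
String document = TextDatReader.read("./dat/text1.dat"); // example path
Set<String> words = discover.discover(document);
for (String word : words) {
    System.out.println(word);
}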
@Override
public int count(String text) {
    int off = 0;
    int count = 0;
    final int len = text.length();
    // Count non-overlapping occurrences of text in the document
    while ((off = document.indexOf(text, off)) > -1) {
        count++;
        off += len;
    }
    return count;
}

@Override
public Pos find(Pos pos) {
    final String text = pos.getTarget();
    final int len = text.length();
    // Continue searching right after the previous match
    int off = pos.getPos() + len;
    if (pos.getPos() < 0) {
        off = 0;
    }
    // Assumed ending: store the next match position (-1 when there is no
    // further occurrence) so that isFound() can report the result
    pos.setPos(document.indexOf(text, off));
    return pos;
}
/**
 * Defines the maximum supposed word length.
 *
 * You can shorten this value if you don't need very long participle
 * results.
 */
private static final int MAX_WORD_LEN = 7;

/**
 * Defines the prediction depth used while executing the participle.
 *
 * Only a negligible gain in accuracy results from increasing this value.
 */
private static final int PREDICT_LEVEL = 3;
public String next(String text, int off) {
    Tree<String> root = new Tree<String>("ROOT");
    recurse(root, off, text, 0);
    List<Node<String>> list = root.getLeaves();
    List<Chunk> chunkList = new ArrayList<Chunk>();
    for (Node<String> node : list) {
        chunkList.add(new Chunk(node.getBranchPath()));
    }
    // Chunks are comparable; the best-ranked chunk's first word wins
    Collections.sort(chunkList);
    return chunkList.get(0).getHead();
}

private void recurse(Node<String> node, int off, String text, int predictDeep) {
    int len = MAX_WORD_LEN + off > text.length() ? text.length() - off
            : MAX_WORD_LEN;
    while (predictDeep < PREDICT_LEVEL) {
        if (len < 1) {
            return;
        }
        String s = text.substring(off, off + len);
        if (len < 2) {
            // Single character: only recurse on Chinese letters
            if (!TextUtils.isCnLetter(text.charAt(off))) {
                break;
            }
            recurse(node.add(s), off + 1, text, predictDeep + 1);
        } else if (dictionary.contains(s)) {
            // Dictionary word found: extend this branch past the word
            recurse(node.add(s), off + s.length(), text, predictDeep + 1);
        }
        len--;
    }
}
}
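The Chunk class and its sort order are not included in this post. The sketch below is a guess in the spirit of MMSEG-style chunk ranking (prefer the segmentation covering the most text); everything in it, including the constructor signature, is an assumption:

import java.util.List;

// Hypothetical Chunk: one candidate segmentation path taken from the tree.
public class Chunk implements Comparable<Chunk> {

    private final List<String> words;

    public Chunk(List<String> words) {
        this.words = words;
    }

    // First word of the path; this is what ChunkStream.next() returns
    public String getHead() {
        return words.isEmpty() ? "" : words.get(0);
    }

    private int totalLength() {
        int sum = 0;
        for (String word : words) {
            sum += word.length();
        }
        return sum;
    }

    @Override
    public int compareTo(Chunk other) {
        // Rank chunks covering more text first, so chunkList.get(0) is best
        return other.totalLength() - this.totalLength();
    }
}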
MechanicalParticiple.java
package grid.text.participle;
import grid.common.TextUtils;
import java.util.Vector;
public class MechanicalParticiple {
public Vector<String> partition(String document) {
    Vector<String> vector = new Vector<String>();
    final int docLen = document.length();
    int off = 0;
    char c;
    String seg = "";
    ChunkStream stream = new ChunkStream();
    while (off < docLen) {
        c = document.charAt(off);
        if (TextUtils.isEnLetter(c) || TextUtils.isNumeric(c)) {
            // Accumulate consecutive English letters and digits as one segment
            seg += c;
            off++;
        } else if (TextUtils.isCnLetter(c)) {
            if (!TextUtils.isBlank(seg)) {
                vector.add(seg);
                seg = "";
            }
            String word = stream.next(document, off);
            if (!TextUtils.isBlank(word)) {
                vector.add(word);
                off += word.length();
            }
        } else {
            if (!TextUtils.isBlank(seg)) {
                vector.add(seg);
                seg = "";
            }
            /**
             * TODO: Uncomment the "ELSE IF" clause if you would like to
             * reserve punctuations
             */
            off++; // assumed ending: skip punctuation and other characters
        }
    }
    if (!TextUtils.isBlank(seg)) {
        vector.add(seg); // assumed ending: flush a trailing segment
    }
    return vector;
}
}
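A quick usage sketch (hypothetical input and output; the actual segmentation depends on the dictionary contents):

MechanicalParticiple participle = new MechanicalParticiple();
Vector<String> result = participle.partition("我们都是中国人ABC123");
System.out.println(result); // e.g. [我们, 都是, 中国人, ABC123]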
Scanner scan = new Scanner(System.in);
System.out.println("Please enter the name of the file to process:\n");
String path = scan.next();
File file = new File(path);
if (!file.exists() || !file.isFile()) {
    throw new Exception("The specified file does not exist!");
}
long maxsize = 1024 * 1024 * 1024; // 1 GB; files above this size are split
long size = 1024 * 1024 * 5; // maximum size of each sub-file: 5 MB
long fileLength = file.length();
if (size <= 0) {
    size = fileLength / 2;
}
// Number of small files produced by the split
int num = (fileLength % size != 0) ? (int) (fileLength / size + 1)
        : (int) (fileLength / size);
if (file.length() >= maxsize) {
    System.out.println("The file exceeds 1 GB. Start splitting it? 1: yes  0: no\n");
    int t = scan.nextInt();
    if (t == 1) {
        TextDatReader.divide(path, size);
        System.out.println("Splitting finished\n");
        System.out.println("Results are saved in the dat folder under the current directory\n");
    }
}
// System.out.println("Please enter the index of the file to process, e.g. 1 for text1.dat in the dat folder\n");
// int m = scans.nextInt();
for (int m = 1; m <= num; m++) {
    String pathdived = "./dat/text" + m + ".dat";
    System.out.println("Extracting file " + m + "...");
    discovrWord(pathdived);
}