import java.util.ArrayList;
import java.util.List;

public class Node<T> {

    protected List<Node<T>> children;
    protected Node<T> parent;
    protected T value;

    Node(T value) {
        this.value = value;
    }

    public Node<T> add(T value) {
        if (null == children) {
            children = new ArrayList<Node<T>>();
        }
        Node<T> child = new Node<T>(value);
        child.setParent(this);
        children.add(child);
        return child;
    }
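For orientation, here is a minimal usage sketch of Node as the ChunkStream code further below uses it; getLeaves() and getBranchPath() are not shown in this excerpt, so their semantics are assumed from their call sites:

// Hypothetical usage sketch; getBranchPath()/getLeaves() semantics assumed.
Node<String> root = new Node<String>("ROOT");
Node<String> first = root.add("中国");   // add() returns the new child
first.add("人民");                       // grow one branch: ROOT -> 中国 -> 人民
root.add("中");                          // a second, competing branch
// Each leaf then corresponds to one candidate segmentation path from ROOT.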
private void initLetterCountMap() throws IOException {
    // Read the corpus data (text.dat)
    String letterResource = TextDatReader.read(COMMON_LETTER_RESOURCE_PATH);
    final int len = letterResource.length();
    char c;
    for (int i = 0; i < len; i++) {
        c = letterResource.charAt(i);
        if (TextUtils.isCnLetter(c)) {
            letterCountMap.increase(c);
        }
    }
    totalLetterCount = letterCountMap.count();
}

private void initWordDic() throws IOException {
    // Read the word dictionary (commondic)
    String bytes = TextDatReader.read(COMMON_WORD_DIC_PATH);
    final int len = bytes.length();
    String s = "";
    char c;
    for (int i = 0; i < len; i++) {
        c = bytes.charAt(i);
        if ('\n' == c || '\r' == c || 0 == c) {
            if (!TextUtils.isBlank(s)) {
                dictionary.add(s.trim());
            }
            s = "";
        } else {
            s += c;
        }
        if (0 == c) {
            break;
        }
    }
}
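The CountMap class is not included in this post; a minimal sketch compatible with the calls used here (increase and count) could look like the following, with all internals being an assumption:

import java.util.HashMap;
import java.util.Map;
import java.util.Set;

// Minimal sketch of CountMap (assumed implementation): counts occurrences
// per key and tracks the grand total across all keys.
public class CountMap<T> {

    private final Map<T, Integer> map = new HashMap<T, Integer>();
    private int total = 0;

    public void increase(T key) {
        Integer n = map.get(key);
        map.put(key, n == null ? 1 : n + 1);
        total++;
    }

    public int get(T key) {
        Integer n = map.get(key);
        return n == null ? 0 : n;
    }

    public Set<T> keySet() {
        return map.keySet();
    }

    public int count() {
        return total; // total number of increase() calls across all keys
    }
}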
/**
 * Minimum number of times a candidate word must appear.
 */
private static int LEAST_COUNT_THRESHOLD = 5;

/**
 * Threshold for the solid rate, calculated from the candidate word's
 * occurrence count and the counts of each of its single letters.
 *
 * The smaller this value is, the more new words you will get, but with
 * less accuracy; the greater it is, the fewer new words, but with higher
 * accuracy.
 */
private static double SOLID_RATE_THRESHOLD = 0.018;

/**
 * Threshold for the entropy value, calculated from the counts of the
 * characters immediately preceding and following the candidate word.
 *
 * The smaller this value is, the more new words you will get, but with
 * less accuracy; the greater it is, the fewer new words, but with higher
 * accuracy.
 */
private static double ENTROPY_THRESHOLD = 1.92;
public EntropyJudger(TextIndexer indexer) {
    this.indexer = indexer;
}

// Assumed signature and solid-rate call; getSolidRate is sketched below.
public boolean judge(String candidate) {
    double solidRate = getSolidRate(candidate);
    if (solidRate < SOLID_RATE_THRESHOLD) {
        return false;
    }
    double entropy = getEntropy(candidate);
    if (entropy < ENTROPY_THRESHOLD) {
        return false;
    }
    return true;
}
private double getEntropy(String candidate) {
    Pos pos = new Pos(candidate);
    CountMap<Character> frontCountMap = new CountMap<Character>();
    CountMap<Character> backCountMap = new CountMap<Character>();
    final int candidateLen = candidate.length();
    int off = 0;
    char c;
    double rate, frontEntropy = 0, backEntropy = 0;
    // Visit every occurrence of the candidate in the indexed text and
    // count the Chinese characters adjacent to each side of it
    while (indexer.find(pos).isFound()) {
        off = pos.getPos();
        c = indexer.charAt(off - 1);
        if (TextUtils.isCnLetter(c)) {
            frontCountMap.increase(c);
        }
        c = indexer.charAt(off + candidateLen);
        if (TextUtils.isCnLetter(c)) {
            backCountMap.increase(c);
        }
    }
    // Assumed ending: turn the neighbor counts into Shannon entropy on each
    // side and return the smaller value, since a true word should combine
    // freely with many different neighbors on both sides
    for (Character key : frontCountMap.keySet()) {
        rate = (double) frontCountMap.get(key) / frontCountMap.count();
        frontEntropy -= rate * Math.log(rate);
    }
    for (Character key : backCountMap.keySet()) {
        rate = (double) backCountMap.get(key) / backCountMap.count();
        backEntropy -= rate * Math.log(rate);
    }
    return frontEntropy < backEntropy ? frontEntropy : backEntropy;
}
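The getSolidRate() method referenced in judge() is not shown in this post. The sketch below is one plausible implementation, assuming only the TextIndexer.count(String) API that appears later in the listing; treat the exact formula as an illustration rather than the original code:

// Sketch (assumption): the solid rate measures how "solid" the candidate
// is by comparing whole-word frequency against the frequencies of its
// individual characters.
private double getSolidRate(String candidate) {
    final int len = candidate.length();
    if (len < 2) {
        return 1;
    }
    final double count = indexer.count(candidate);
    if (count < LEAST_COUNT_THRESHOLD) {
        return 0; // appears too rarely to judge reliably
    }
    double rate = 1;
    for (int i = 0; i < len; i++) {
        // ratio of whole-candidate occurrences to single-character occurrences
        rate *= count / indexer.count(String.valueOf(candidate.charAt(i)));
    }
    // normalize by length so longer candidates are not unfairly penalized
    return Math.pow(rate, 1D / len) * Math.sqrt(len);
}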
static {
    // Index the structural (function) characters so that candidates
    // starting or ending with one can be skipped quickly
    for (char c : structuralLetters) {
        structuralLetterSet.add(c);
    }
}

public NewWordDiscover() {
    dictionary = CnDictionary.Instance();
}
/**
 * New word discovery is based on statistics and entropy, so make sure the
 * document size is on the order of 100 KB or more, or you may get
 * unsatisfying results.
 *
 * @param document
 * @return the set of discovered new words
 */
public Set<String> discover(String document) {
    Set<String> set = new HashSet<String>();
    TextIndexer indexer = new CnPreviewTextIndexer(document);
    TextSelector selector = new CnTextSelector(document, MIN_CANDIDATE_LEN,
            MAX_CANDIDATE_LEN);
    EntropyJudger judger = new EntropyJudger(indexer);
    String candidate;
    while (!selector.end()) {
        candidate = selector.next();
        if (TextUtils.isBlank(candidate)) {
            continue;
        }
        // Skip candidates that start or end with a structural letter
        if (structuralLetterSet.contains(candidate.charAt(0))
                || structuralLetterSet.contains(candidate.charAt(candidate.length() - 1))) {
            continue;
        }
        // Replace the IF clause with "set.contains(candidate)" if you want
        // to find new words without any dictionary
        if (dictionary.contains(candidate) || set.contains(candidate)) {
            selector.select();
        } else if (judger.judge(candidate)) {
            set.add(candidate);
        }
    }
    return set;
}
}
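A minimal end-to-end driver might look like the following; the file path is only an example, and TextDatReader.read is assumed to behave as in the loading code above:

// Hypothetical driver: read a corpus file and print every discovered word.
NewWordDiscover discover = new NewWordDiscover();
String document = TextDatReader.read("./dat/text1.dat"); // example path
Set<String> words = discover.discover(document);
for (String word : words) {
    System.out.println(word);
}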
@Override
public int count(String text) {
    int off = 0;
    int count = 0;
    final int len = text.length();
    // Count non-overlapping occurrences of text in the document
    while ((off = document.indexOf(text, off)) > -1) {
        count++;
        off += len;
    }
    return count;
}

@Override
public Pos find(Pos pos) {
    final String text = pos.getTarget();
    final int len = text.length();
    // Continue searching right after the previous match
    int off = pos.getPos() + len;
    if (pos.getPos() < 0) {
        off = 0;
    }
    // Assumed ending: store the next match position (-1 when there is no
    // further occurrence) so that isFound() can report the result
    pos.setPos(document.indexOf(text, off));
    return pos;
}
/**
 * Defines the maximum supposed word length.
 *
 * You can shorten this value if you don't need very long participle
 * results.
 */
private static final int MAX_WORD_LEN = 7;

/**
 * Defines the prediction depth used while executing the participle.
 *
 * Only a negligible gain in accuracy results from increasing this value.
 */
private static final int PREDICT_LEVEL = 3;
public String next(String text, int off) {
    Tree<String> root = new Tree<String>("ROOT");
    recurse(root, off, text, 0);
    List<Node<String>> list = root.getLeaves();
    List<Chunk> chunkList = new ArrayList<Chunk>();
    for (Node<String> node : list) {
        chunkList.add(new Chunk(node.getBranchPath()));
    }
    // Chunks are comparable; the best-ranked chunk's first word wins
    Collections.sort(chunkList);
    return chunkList.get(0).getHead();
}

private void recurse(Node<String> node, int off, String text, int predictDeep) {
    int len = MAX_WORD_LEN + off > text.length() ? text.length() - off
            : MAX_WORD_LEN;
    while (predictDeep < PREDICT_LEVEL) {
        if (len < 1) {
            return;
        }
        String s = text.substring(off, off + len);
        if (len < 2) {
            // Single character: only recurse on Chinese letters
            if (!TextUtils.isCnLetter(text.charAt(off))) {
                break;
            }
            recurse(node.add(s), off + 1, text, predictDeep + 1);
        } else if (dictionary.contains(s)) {
            // Dictionary word found: extend this branch past the word
            recurse(node.add(s), off + s.length(), text, predictDeep + 1);
        }
        len--;
    }
}
}
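The Chunk class and its sort order are not included in this post. The sketch below is a guess in the spirit of MMSEG-style chunk ranking (prefer the segmentation covering the most text); everything in it, including the constructor signature, is an assumption:

import java.util.List;

// Hypothetical Chunk: one candidate segmentation path taken from the tree.
public class Chunk implements Comparable<Chunk> {

    private final List<String> words;

    public Chunk(List<String> words) {
        this.words = words;
    }

    // First word of the path; this is what ChunkStream.next() returns
    public String getHead() {
        return words.isEmpty() ? "" : words.get(0);
    }

    private int totalLength() {
        int sum = 0;
        for (String word : words) {
            sum += word.length();
        }
        return sum;
    }

    @Override
    public int compareTo(Chunk other) {
        // Rank chunks covering more text first, so chunkList.get(0) is best
        return other.totalLength() - this.totalLength();
    }
}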
MechanicalParticiple.java
package grid.text.participle;
import grid.common.TextUtils;
import java.util.Vector;
public class MechanicalParticiple {
public Vector<String> partition(String document) {
    Vector<String> vector = new Vector<String>();
    final int docLen = document.length();
    int off = 0;
    char c;
    String seg = "";
    ChunkStream stream = new ChunkStream();
    while (off < docLen) {
        c = document.charAt(off);
        if (TextUtils.isEnLetter(c) || TextUtils.isNumeric(c)) {
            // Accumulate consecutive English letters and digits as one segment
            seg += c;
            off++;
        } else if (TextUtils.isCnLetter(c)) {
            if (!TextUtils.isBlank(seg)) {
                vector.add(seg);
                seg = "";
            }
            String word = stream.next(document, off);
            if (!TextUtils.isBlank(word)) {
                vector.add(word);
                off += word.length();
            }
        } else {
            if (!TextUtils.isBlank(seg)) {
                vector.add(seg);
                seg = "";
            }
            /**
             * TODO: Uncomment the "ELSE IF" clause if you would like to
             * reserve punctuations
             */
            off++; // assumed ending: skip punctuation and other characters
        }
    }
    if (!TextUtils.isBlank(seg)) {
        vector.add(seg); // assumed ending: flush a trailing segment
    }
    return vector;
}
}
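A quick usage sketch (hypothetical input and output; the actual segmentation depends on the dictionary contents):

MechanicalParticiple participle = new MechanicalParticiple();
Vector<String> result = participle.partition("我们都是中国人ABC123");
System.out.println(result); // e.g. [我们, 都是, 中国人, ABC123]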
Scanner scan = new Scanner(System.in);
System.out.println("Please enter the name of the file to process:\n");
String path = scan.next();
File file = new File(path);
if (!file.exists() || !file.isFile()) {
    throw new Exception("The specified file does not exist!");
}
long maxsize = 1024 * 1024 * 1024; // 1 GB; files above this size are split
long size = 1024 * 1024 * 5; // maximum size of each sub-file: 5 MB
long fileLength = file.length();
if (size <= 0) {
    size = fileLength / 2;
}
// Number of small files produced by the split
int num = (fileLength % size != 0) ? (int) (fileLength / size + 1)
        : (int) (fileLength / size);
if (file.length() >= maxsize) {
    System.out.println("The file exceeds 1 GB. Start splitting it? 1: yes  0: no\n");
    int t = scan.nextInt();
    if (t == 1) {
        TextDatReader.divide(path, size);
        System.out.println("Splitting finished\n");
        System.out.println("Results are saved in the dat folder under the current directory\n");
    }
}
// System.out.println("Please enter the index of the file to process, e.g. 1 for text1.dat in the dat folder\n");
// int m = scans.nextInt();
for (int m = 1; m <= num; m++) {
    String pathdived = "./dat/text" + m + ".dat";
    System.out.println("Extracting file " + m + "...");
    discovrWord(pathdived);
}