Table of Contents
  1. Project Structure
  2. common
    2.1. CountMap.java
    2.2. Node.java
    2.3. TextDatReader.java
    2.4. TextUtils.java
    2.5. Tree.java
  3. dic
    3.1. CnDictionary.java
  4. evolution
    4.1. EntropyJudger.java
    4.2. NewWordDiscover.java
  5. index
    5.1. CnPreviewTextIndexer.java
    5.2. Pos.java
    5.3. SimpleTextIndexer.java
    5.4. TextIndexer.java
  6. participle
    6.1. Chunk.java
    6.2. ChunkStream.java
    6.3. MechanicalParticiple.java
  7. selector
    7.1. CnTextSelector.java
    7.2. CommonTextSelector.java
    7.3. TextSelector.java
  8. Test Code
    8.1. NewWordDiscoverTest.java
    8.2. ParticipleTest.java

Before we begin, take a look at some of the words that post-90s users on 人人网 (Renren) love to use.

Fun, isn't it? This article shows you how to automatically extract new words from plain text, so you can find out what young people are into these days (for an old-timer like this blogger, that is genuinely useful, sob).

Project Structure

Of course, you can replace the text.dat and common.dic files with your own. Just make sure text.dat contains a substantial amount of text, otherwise the results won't be much good.

As for the underlying theory, read this article by the great Matrix67 and you will understand:

互联网时代的社会语言学:基于SNS的文本数据挖掘 (Sociolinguistics in the Internet Age: Text Data Mining Based on SNS)

Corpus data download

Now, on to the code.

common

This package contains the following classes, which mainly define the data structures.

CountMap.java

Defines a counting map used for tallying occurrences; it implements Serializable so it can be persisted.

package grid.common;

import java.io.Serializable;
import java.util.HashMap;

public class CountMap<T> extends HashMap<T, Integer> implements Serializable {

    private static final long serialVersionUID = 6097963798841161750L;

    public void increase(T t) { // add an element, incrementing its count
        Integer count = get(t);
        if (null == count) {
            put(t, 1);
        } else {
            put(t, ++count);
        }
    }

    public int count() { // total count over all keys
        int count = 0;
        for (T t : keySet()) {
            count += get(t);
        }
        return count;
    }

    public int get(char c) { // null-safe lookup for character keys
        Integer count = super.get(c);
        return null == count ? 0 : count;
    }
}
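A quick usage sketch (CountMapDemo is a hypothetical class, not part of the project): tally a few characters and read the counts back.

import grid.common.CountMap;

public class CountMapDemo {
    public static void main(String[] args) {
        CountMap<Character> map = new CountMap<Character>();
        for (char c : "哈哈哈呀".toCharArray()) {
            map.increase(c); // tally each character
        }
        System.out.println(map.get('哈')); // 3
        System.out.println(map.count());   // 4, the total over all keys
    }
}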

Node.java

Defines a node of the segmentation tree.

package grid.common;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class Node<T> {

    protected List<Node<T>> children;

    protected Node<T> parent;

    protected T value;

    Node(T value) {
        this.value = value;
    }

    public Node<T> add(T value) {
        if (null == children) {
            children = new ArrayList<Node<T>>();
        }
        Node<T> child = new Node<T>(value);
        child.setParent(this);
        children.add(child);
        return child;
    }

    public T getValue() {
        return value;
    }

    public Node<T> getParent() {
        return parent;
    }

    public void setParent(Node<T> parent) {
        this.parent = parent;
    }

    // recursively collect the leaves underneath the given node
    private void recurseChildren(List<Node<T>> list, Node<T> parent) {
        if (null == parent.children) {
            list.add(parent);
        } else {
            for (Node<T> node : parent.children) {
                recurseChildren(list, node);
            }
        }
    }

    public List<Node<T>> getLeaves() {
        List<Node<T>> list = new ArrayList<Node<T>>();
        recurseChildren(list, this);
        return list;
    }

    // the path of values from just below the root down to this node
    public List<T> getBranchPath() {
        List<T> list = new ArrayList<T>();
        Node<T> node = this;
        do {
            list.add(node.getValue());
            node = node.parent;
        } while (null != node && !(node instanceof Tree<?>));
        Collections.reverse(list);
        return list;
    }

    private void append(StringBuilder builder, int deep, Node<T> node) {
        for (int i = 0; i < deep; i++) {
            builder.append(" ");
        }
        builder.append("|--");
        builder.append(node.getValue());
        builder.append("\n");
        if (null != node.children) {
            for (Node<T> child : node.children) {
                append(builder, deep + 1, child);
            }
        }
    }

    public String dump() {
        StringBuilder builder = new StringBuilder();
        append(builder, 0, this);
        return builder.toString();
    }

    public String toString() {
        return value.toString();
    }
}

TextDatReader.java

Reads the corpus data, and can split oversized files into chunks.

package grid.common;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;

public class TextDatReader {

    @SuppressWarnings("resource")
    public static String read(String path) throws IOException {
        File file = new File(path);
        FileInputStream s = new FileInputStream(file);
        // open the file as UTF-8 (a plain FileReader would use the
        // platform default encoding)
        BufferedReader reader = new BufferedReader(new InputStreamReader(s,
                "utf8"));
        char buffer[] = new char[(int) file.length()];
        reader.read(buffer);
        return new String(buffer);
    }

    // create the ./dat directory if it does not exist yet
    public static void createDir() {
        File file = new File("./dat");
        if (!file.exists() && !file.isDirectory()) {
            file.mkdir();
        }
    }

    public static final String SUFFIX = ".dat"; // suffix for the split files

    // Split the given file into chunks of the given size in bytes.
    // "name" is the file to split, "size" is the maximum chunk size.
    public static void divide(String name, long size) throws Exception {
        File file = new File(name);
        if (!file.exists() || (!file.isFile())) {
            throw new Exception("The specified file does not exist!");
        }
        // total size of the input file
        long fileLength = file.length();
        if (size <= 0) {
            size = fileLength / 2;
        }
        // number of chunks after splitting
        int num = (fileLength % size != 0) ? (int) (fileLength / size + 1)
                : (int) (fileLength / size);
        // absolute paths of the generated chunks
        String[] fileNames = new String[num];
        // input stream over the file being split
        FileInputStream in = new FileInputStream(file);
        // begin/end offsets of the current chunk
        long end = 0;
        int begin = 0;
        createDir();
        for (int i = 1; i <= num; i++) {
            // every chunk except possibly the last has exactly "size" bytes
            File outFile = new File("./dat", "text" + i + SUFFIX);
            FileOutputStream out = new FileOutputStream(outFile);
            // advance the end offset by one chunk
            end += size;
            end = (end > fileLength) ? fileLength : end;
            // copy bytes from the input stream into the chunk
            for (; begin < end; begin++) {
                out.write(in.read());
            }
            out.close();
            fileNames[i - 1] = outFile.getAbsolutePath(); // was fileNames[i]: off-by-one array overflow
            System.out.println("Chunk " + i + " written...");
        }
        in.close();
    }

    // public static void main(final String[] args) throws Exception {
    //     String name = "text.dat";
    //     long size = 1024 * 1024 * 4; // split into 4 MB chunks
    //     TextDatReader.divide(name, size);
    // }
}

TextUtils.java

Text-processing helpers, such as blank checks and character/substring matching.

package grid.common;

public class TextUtils {

    public static boolean isCnLetter(char c) { // is it a Chinese character?
        return c >= 0x4E00 && c <= 0x9FCB;
    }

    public static boolean isNumeric(char c) { // is it a digit?
        return c >= '0' && c <= '9';
    }

    public static boolean isEnLetter(char c) { // is it an English letter?
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    }

    // does src contain dest starting at offset off?
    public static boolean match(String src, int off, String dest) {
        int len = dest.length();
        int srcLen = src.length();
        for (int i = 0; i < len; i++) {
            if (srcLen <= off + i) {
                return false;
            }
            if (dest.charAt(i) != src.charAt(off + i)) {
                return false;
            }
        }
        return true;
    }

    // is the string null, empty, or whitespace only?
    public static boolean isBlank(String str) {
        return null == str || str.isEmpty() || str.trim().isEmpty();
    }
}

Tree.java

The segmentation tree itself; the root is a Tree so that Node.getBranchPath() knows where to stop.

package grid.common;

public class Tree<T> extends Node<T> {

    public Tree(T value) {
        super(value);
    }
}
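To see how Node and Tree fit together, here is a small sketch (TreeDemo is a hypothetical class): build a two-branch tree, dump it, and walk each leaf's path back to the root. This is exactly how ChunkStream later recovers candidate segmentations.

import grid.common.Node;
import grid.common.Tree;

public class TreeDemo {
    public static void main(String[] args) {
        Tree<String> root = new Tree<String>("ROOT");
        root.add("中").add("国"); // first branch: 中 -> 国
        root.add("中国");          // second branch: the two-character word
        System.out.print(root.dump()); // ASCII dump of the tree
        for (Node<String> leaf : root.getLeaves()) {
            System.out.println(leaf.getBranchPath()); // [中, 国] and [中国]
        }
    }
}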

dic

Contains the CnDictionary class.

CnDictionary.java

Dictionary handling.

package grid.text.dic;

import grid.common.CountMap;
import grid.common.TextDatReader;
import grid.common.TextUtils;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

public class CnDictionary {

    private final String COMMON_WORD_DIC_PATH = "common.dic";

    /**
     * This text data is for character statistics. Change it to your own if
     * you like.
     */
    private final String COMMON_LETTER_RESOURCE_PATH = "text.dat";

    private Set<String> dictionary = new HashSet<String>();

    private CountMap<Character> letterCountMap = new CountMap<Character>();

    private int totalLetterCount;

    private static CnDictionary instance;

    // singleton
    public static CnDictionary Instance() {
        if (null == instance) {
            try {
                instance = new CnDictionary();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return instance;
    }

    private CnDictionary() throws IOException {
        initWordDic();
        initLetterCountMap();
    }

    private void initLetterCountMap() throws IOException {
        // read the corpus (text.dat) and count every Chinese character
        String letterResource = TextDatReader.read(COMMON_LETTER_RESOURCE_PATH);
        final int len = letterResource.length();
        char c;
        for (int i = 0; i < len; i++) {
            c = letterResource.charAt(i);
            if (TextUtils.isCnLetter(c)) {
                letterCountMap.increase(c);
            }
        }
        totalLetterCount = letterCountMap.count();
    }

    private void initWordDic() throws IOException {
        // read the word dictionary (common.dic), one word per line
        String bytes = TextDatReader.read(COMMON_WORD_DIC_PATH);
        final int len = bytes.length();
        String s = "";
        char c;
        for (int i = 0; i < len; i++) {
            c = bytes.charAt(i);

            if ('\n' == c || '\r' == c || 0 == c) {
                if (!TextUtils.isBlank(s)) {
                    dictionary.add(s.trim());
                }
                s = "";
            } else {
                s += c;
            }
            if (0 == c) {
                break;
            }
        }
    }

    public boolean contains(String word) {
        return dictionary.contains(word);
    }

    public double rate(char c) {
        return (double) letterCountMap.get(c) / totalLetterCount;
    }

    public int size() {
        return dictionary.size();
    }
}
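A quick sanity check of the dictionary, assuming common.dic and text.dat are in the working directory (DictionaryDemo is hypothetical):

import grid.text.dic.CnDictionary;

public class DictionaryDemo {
    public static void main(String[] args) {
        CnDictionary dictionary = CnDictionary.Instance();
        System.out.println(dictionary.size());           // number of entries loaded
        System.out.println(dictionary.contains("中国"));  // dictionary lookup
        System.out.println(dictionary.rate('的'));        // relative corpus frequency
    }
}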

evolution

EntropyJudger.java

Computes the solid rate and the boundary entropy of a candidate word.

package grid.text.evolution;

import grid.common.CountMap;
import grid.common.TextUtils;
import grid.text.index.Pos;
import grid.text.index.TextIndexer;

public class EntropyJudger {

    private TextIndexer indexer;

    /**
     * Minimum number of times a candidate word must appear.
     */
    private static int LEAST_COUNT_THRESHOLD = 5;

    /**
     * Threshold for the solid rate, calculated from the candidate's count
     * and the count of each of its characters.
     *
     * The smaller this value is, the more new words you will get, but with
     * less accuracy. The greater it is, the fewer new words you will get,
     * but with higher accuracy.
     */
    private static double SOLID_RATE_THRESHOLD = 0.018;

    /**
     * Threshold for the entropy, calculated from the counts of the
     * characters immediately before and after the candidate word.
     *
     * The smaller this value is, the more new words you will get, but with
     * less accuracy. The greater it is, the fewer new words you will get,
     * but with higher accuracy.
     */
    private static double ENTROPY_THRESHOLD = 1.92;

    public EntropyJudger(TextIndexer indexer) {
        this.indexer = indexer;
    }

    public boolean judge(String candidate) {
        double solidRate = getSolidRate(candidate);

        if (solidRate < SOLID_RATE_THRESHOLD) {
            return false;
        }

        double entropy = getEntropy(candidate);

        if (entropy < ENTROPY_THRESHOLD) {
            return false;
        }
        return true;
    }

    private double getEntropy(String candidate) {
        Pos pos = new Pos(candidate);
        CountMap<Character> frontCountMap = new CountMap<Character>();
        CountMap<Character> backCountMap = new CountMap<Character>();
        final int candidateLen = candidate.length();
        int off = 0;
        char c;
        double rate, frontEntropy = 0, backEntropy = 0;

        // tally the characters adjacent to every occurrence of the candidate
        while (indexer.find(pos).isFound()) {
            off = pos.getPos();

            c = indexer.charAt(off - 1);
            if (TextUtils.isCnLetter(c)) {
                frontCountMap.increase(c);
            }
            c = indexer.charAt(off + candidateLen);
            if (TextUtils.isCnLetter(c)) {
                backCountMap.increase(c);
            }
        }

        for (char key : frontCountMap.keySet()) {
            rate = (double) frontCountMap.get(key) / frontCountMap.count();
            frontEntropy -= rate * Math.log(rate);
        }
        for (char key : backCountMap.keySet()) {
            rate = (double) backCountMap.get(key) / backCountMap.count();
            backEntropy -= rate * Math.log(rate);
        }

        // the weaker side decides: a real word should be free on both sides
        return frontEntropy > backEntropy ? backEntropy : frontEntropy;
    }

    /**
     * @param candidate the candidate word
     * @return the geometric mean of count(word) / count(letter) over all of
     *         its letters, scaled by the square root of its length
     */
    public double getSolidRate(String candidate) {

        final int candidateLen = candidate.length();

        if (candidateLen < 2) {
            return 1;
        }

        final int count = indexer.count(candidate);
        double rate = 1;

        if (count < LEAST_COUNT_THRESHOLD) {
            return 0;
        }

        for (int i = 0; i < candidateLen; i++) {
            rate *= (double) count / indexer.count("" + candidate.charAt(i));
        }

        return Math.pow(rate, 1D / candidateLen) * Math.sqrt(candidateLen);
    }

    public void setIndexer(TextIndexer indexer) {
        this.indexer = indexer;
    }
}
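Written out, what the code above computes for a candidate word $w = c_1 c_2 \dots c_n$ is (this merely restates getSolidRate and getEntropy in LaTeX notation):

$$\mathrm{solid}(w) = \sqrt{n}\,\Bigl(\prod_{i=1}^{n}\frac{\mathrm{count}(w)}{\mathrm{count}(c_i)}\Bigr)^{1/n}, \qquad H = -\sum_{c} p(c)\,\ln p(c)$$

where count(·) is the number of occurrences in the indexed text and p(c) is the relative frequency of character c among the characters adjacent to w on one side. judge() accepts w only when solid(w) ≥ 0.018 and the smaller of the front and back entropies is ≥ 1.92; the entropy test is Matrix67's "degree of freedom" criterion.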

NewWordDiscover.java

The word-extraction driver.

package grid.text.evolution;

import grid.common.TextUtils;
import grid.text.dic.CnDictionary;
import grid.text.index.CnPreviewTextIndexer;
import grid.text.index.TextIndexer;
import grid.text.selector.CnTextSelector;
import grid.text.selector.TextSelector;

import java.util.HashSet;
import java.util.Set;

public class NewWordDiscover {

    private CnDictionary dictionary;

    /**
     * Minimum word length
     */
    private final static int MIN_CANDIDATE_LEN = 2;

    /**
     * Maximum word length
     */
    private final static int MAX_CANDIDATE_LEN = 6;

    /**
     * Structural (function) characters: a real word rarely starts or ends
     * with one of these.
     */
    private static Set<Character> structuralLetterSet = new HashSet<Character>();

    private static char[] structuralLetters = { '我', '你', '您', '他', '她', '谁',
            '哪', '那', '这', '的', '了', '着', '也', '是', '有', '不', '在', '与', '呢',
            '啊', '呀', '吧', '嗯', '哦', '哈', '呐' };

    static {
        for (char c : structuralLetters) {
            structuralLetterSet.add(c);
        }
    }

    public NewWordDiscover() {
        dictionary = CnDictionary.Instance();
    }

    /**
     * New word discovery is based on statistics and entropy, so make sure
     * the document is at least on the order of 100 KB, or the results may
     * be unsatisfying.
     */
    public Set<String> discover(String document) {

        Set<String> set = new HashSet<String>();
        TextIndexer indexer = new CnPreviewTextIndexer(document);
        TextSelector selector = new CnTextSelector(document, MIN_CANDIDATE_LEN,
                MAX_CANDIDATE_LEN);
        EntropyJudger judger = new EntropyJudger(indexer);
        String candidate;
        while (!selector.end()) {
            candidate = selector.next();
            if (TextUtils.isBlank(candidate)) {
                continue;
            }
            if (structuralLetterSet.contains(candidate.charAt(0))
                    || structuralLetterSet.contains(candidate.charAt(candidate
                            .length() - 1))) {
                continue;
            }
            // Replace this IF clause with "set.contains(candidate)" if you
            // want to find new words without any dictionary
            if (dictionary.contains(candidate) || set.contains(candidate)) {
                selector.select();
            } else if (judger.judge(candidate)) {
                set.add(candidate);
            }
        }
        return set;
    }
}
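A minimal driver sketch, assuming text.dat is present (the full test program at the end of the article adds interactive file splitting on top of this):

import grid.common.TextDatReader;
import grid.text.evolution.NewWordDiscover;

import java.util.Set;

public class DiscoverDemo {
    public static void main(String[] args) throws Exception {
        // any large raw text works; 100 KB or more gives sensible statistics
        String document = TextDatReader.read("text.dat");
        Set<String> words = new NewWordDiscover().discover(document);
        System.out.println(words);
    }
}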

index

These classes build an index over the text so that candidate words and their surrounding characters can be looked up quickly.

CnPreviewTextIndexer.java

package grid.text.index;

import grid.common.TextUtils;

import java.util.HashMap;
import java.util.Map;
import java.util.Vector;

public class CnPreviewTextIndexer implements TextIndexer {

    // rough number of distinct common Chinese characters
    private final static int CN_LETTER_COUNT = 5021;

    private String document;

    // for every character, the list of positions where it occurs
    private Map<Character, Vector<Integer>> posMap;

    public CnPreviewTextIndexer(String document) {
        this.document = document;
        init();
    }

    private void init() {
        final int len = document.length();

        // estimated occurrences per character, used as initial capacity
        final int supposedMinCount = 1 + (int) Math.log(len / CN_LETTER_COUNT
                + 1);

        char c;

        Vector<Integer> posVector;

        posMap = new HashMap<Character, Vector<Integer>>(CN_LETTER_COUNT);

        for (int i = 0; i < len; i++) {
            c = document.charAt(i);
            if (!TextUtils.isCnLetter(c)) {
                continue;
            }
            posVector = posMap.get(c);
            if (null == posVector) {
                posVector = new Vector<Integer>(supposedMinCount);
                posMap.put(c, posVector);
            }
            posVector.add(i);
        }
    }

    @Override
    public int count(String text) {

        if (TextUtils.isBlank(text)) {
            return 0;
        }

        Vector<Integer> vector = posMap.get(text.charAt(0));

        if (null == vector) {
            return 0;
        }

        if (1 == text.length()) {
            return vector.size();
        }

        // try the full text at every position of its first character
        final int size = vector.size();
        int count = 0;

        for (int i = 0; i < size; i++) {
            if (TextUtils.match(document, vector.get(i), text)) {
                count++;
            }
        }

        return count;
    }

    @Override
    public Pos find(Pos pos) {
        String text = pos.getTarget();

        pos.setFound(false);

        if (TextUtils.isBlank(text)) {
            return pos;
        }

        Vector<Integer> vector = posMap.get(text.charAt(0));

        if (null == vector) {
            return pos;
        }

        // resume scanning right after the previous match
        final int arraySize = vector.size();
        final int arrayIndex = pos.arrayIndex + 1;

        for (int i = arrayIndex; i < arraySize; i++) {
            if (TextUtils.match(document, vector.get(i), text)) {
                pos.setFound(true);
                pos.setPos(vector.get(i));
                pos.arrayIndex = i;
                break;
            }
        }

        return pos;
    }

    @Override
    public int len() {
        return document.length();
    }

    @Override
    public String sub(int off, int len) {
        if (off < 0 || off + len >= document.length()) {
            return "";
        }
        return document.substring(off, off + len);
    }

    @Override
    public char charAt(int index) {
        if (index < 0 || index >= document.length()) {
            return 0;
        }
        return document.charAt(index);
    }
}
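A sketch of the two indexer operations on a toy document (IndexerDemo is hypothetical): count() tallies occurrences, and repeated find() calls step through them via the Pos cursor.

import grid.text.index.CnPreviewTextIndexer;
import grid.text.index.Pos;

public class IndexerDemo {
    public static void main(String[] args) {
        CnPreviewTextIndexer indexer = new CnPreviewTextIndexer("南京市长江大桥,南京市长");
        System.out.println(indexer.count("南京")); // 2
        Pos pos = new Pos("南京");
        while (indexer.find(pos).isFound()) {
            System.out.println(pos.getPos()); // 0, then 8
        }
    }
}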

Pos.java

package grid.text.index;

public class Pos {

    private String target;

    /**
     * Position of the current full match of the target text
     */
    private int pos = -1;

    /**
     * Index into the position array for the current full match
     */
    int arrayIndex = -1;

    private boolean found = false;

    public Pos(String target) {
        this.target = target;
    }

    public String getTarget() {
        return target;
    }

    public int getPos() {
        return pos;
    }

    public boolean isFound() {
        return found;
    }

    void setPos(int pos) {
        this.pos = pos;
    }

    void setFound(boolean found) {
        this.found = found;
    }
}

SimpleTextIndexer.java

package grid.text.index;

public class SimpleTextIndexer implements TextIndexer {

    private String document;

    public SimpleTextIndexer(String document) {
        this.document = document;
    }

    @Override
    public int count(String text) {
        int off = 0;
        int count = 0;
        final int len = text.length();
        while ((off = document.indexOf(text, off)) > -1) {
            count++;
            off += len;
        }
        return count;
    }

    @Override
    public Pos find(Pos pos) {
        final String text = pos.getTarget();
        final int len = text.length();
        int off = pos.getPos() + len;
        if (pos.getPos() < 0)
            off = 0;

        pos.setFound(false);

        if ((off = document.indexOf(text, off)) > -1) {
            pos.setFound(true);
            pos.setPos(off);
        }
        return pos;
    }

    @Override
    public int len() {
        return document.length();
    }

    @Override
    public String sub(int off, int len) {
        return document.substring(off, off + len);
    }

    @Override
    public char charAt(int index) {
        if (index < 0 || index >= document.length()) {
            return 0;
        }
        return document.charAt(index);
    }
}

TextIndexer.java

package grid.text.index;

public interface TextIndexer {

    /**
     * @param text
     * @return the number of occurrences of the given text
     */
    public int count(String text);

    /**
     * @param pos
     * @return the next position after the current pos
     */
    public Pos find(Pos pos);

    /**
     * @return the length of the original document
     */
    public int len();

    /**
     * @param off
     * @param len
     * @return the substring starting at <b>off</b> with length <b>len</b>
     */
    public String sub(int off, int len);

    /**
     * @param index
     * @return the character at the specified index
     */
    public char charAt(int index);
}

participle

Word segmentation; see the implementations for the details.

Chunk.java

package grid.text.participle;

import grid.text.dic.CnDictionary;

import java.util.List;

public class Chunk implements Comparable<Chunk> {

    private List<String> list;

    private int len = 0;

    private double avg = 0;

    private double variance = 0;

    public Chunk(List<String> list) {
        this.list = list;
        init();
    }

    private void init() {
        // total covered length, average word length, and length variance
        for (String s : list) {
            len += s.length();
        }
        avg = (double) len / list.size();

        for (String s : list) {
            variance += Math.pow(avg - s.length(), 2);
        }
        variance = Math.sqrt(variance);
    }

    public int getLen() {
        return len;
    }

    public double getAvg() {
        return avg;
    }

    public double getVariance() {
        return variance;
    }

    public String getHead() {
        if (null == list || list.isEmpty()) {
            return "";
        }
        return list.get(0);
    }

    // descending comparison: the greater value sorts first
    private int compareDouble(double d1, double d2) {
        if (d1 - d2 < -0.0000001D) {
            return 1;
        } else if (d1 - d2 > 0.0000001D) {
            return -1;
        }
        return 0;
    }

    @Override
    public int compareTo(Chunk o) {

        // prefer the chunk that covers more text
        if (len != o.len) {
            return o.len - len;
        }

        // then the greater average word length
        int d = compareDouble(avg, o.avg);
        if (0 != d) {
            return d;
        }

        // then the greater length variance
        d = compareDouble(variance, o.variance);
        if (0 != d) {
            return d;
        }

        // finally, the greater summed corpus frequency of single-character words
        CnDictionary dictionary = CnDictionary.Instance();

        double rateSrc = 0, rateDest = 0;
        for (String s : list) {
            if (1 == s.length()) {
                rateSrc += dictionary.rate(s.charAt(0));
            }
        }
        for (String s : o.list) {
            if (1 == s.length()) {
                rateDest += dictionary.rate(s.charAt(0));
            }
        }
        return compareDouble(rateSrc, rateDest);
    }

    public String toString() {
        return list.toString();
    }
}

ChunkStream.java

package grid.text.participle;

import grid.common.Node;
import grid.common.TextUtils;
import grid.common.Tree;
import grid.text.dic.CnDictionary;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class ChunkStream {

    /**
     * The maximum supposed word length.
     *
     * You can shorten this value if you don't need very long words in the
     * segmentation result.
     */
    private static final int MAX_WORD_LEN = 7;

    /**
     * How many words ahead to look while segmenting.
     *
     * Increasing this value brings only a negligible gain in accuracy.
     */
    private static final int PREDICT_LEVEL = 3;

    private static CnDictionary dictionary = CnDictionary.Instance();

    public String next(String text, int off) {
        // enumerate all chunks of up to PREDICT_LEVEL words starting at off,
        // rank them, and return the first word of the best chunk
        Tree<String> root = new Tree<String>("ROOT");
        recurse(root, off, text, 0);
        List<Node<String>> list = root.getLeaves();
        List<Chunk> chunkList = new ArrayList<Chunk>();
        for (Node<String> node : list) {
            chunkList.add(new Chunk(node.getBranchPath()));
        }
        Collections.sort(chunkList);
        return chunkList.get(0).getHead();
    }

    private void recurse(Node<String> node, int off, String text,
            int predictDeep) {
        int len = MAX_WORD_LEN + off > text.length() ? text.length() - off
                : MAX_WORD_LEN;

        while (predictDeep < PREDICT_LEVEL) {
            if (len < 1) {
                return;
            }

            String s = text.substring(off, off + len);
            if (len < 2) {
                // a single character is always a valid fallback branch
                if (!TextUtils.isCnLetter(text.charAt(off))) {
                    break;
                }
                recurse(node.add(s), off + 1, text, predictDeep + 1);
            } else if (dictionary.contains(s)) {
                // dictionary word: branch here and continue after it
                recurse(node.add(s), off + s.length(), text, predictDeep + 1);
            }
            len--;
        }
    }
}
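In use, next() returns the first word of the best-ranked chunk at the given offset. A sketch (ChunkStreamDemo is hypothetical, and the actual output depends entirely on what common.dic contains):

import grid.text.participle.ChunkStream;

public class ChunkStreamDemo {
    public static void main(String[] args) {
        ChunkStream stream = new ChunkStream();
        // pick the word starting at offset 0; the winner is decided by the
        // dictionary entries and the chunk ranking rules above
        System.out.println(stream.next("研究生命起源", 0));
    }
}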

MechanicalParticiple.java

package grid.text.participle;

import grid.common.TextUtils;

import java.util.Vector;

public class MechanicalParticiple {

    public Vector<String> partition(String document) {
        Vector<String> vector = new Vector<String>();
        final int docLen = document.length();
        int off = 0;
        char c;
        String seg = "";
        ChunkStream stream = new ChunkStream();

        while (off < docLen) {
            c = document.charAt(off);
            if (TextUtils.isEnLetter(c) || TextUtils.isNumeric(c)) {
                // accumulate runs of English letters and digits as one token
                seg += c;
                off++;
            } else if (TextUtils.isCnLetter(c)) {
                if (!TextUtils.isBlank(seg)) {
                    vector.add(seg);
                    seg = "";
                }
                // let the chunk stream pick the next Chinese word
                String word = stream.next(document, off);
                if (!TextUtils.isBlank(word)) {
                    vector.add(word);
                    off += word.length();
                }
            } else {
                if (!TextUtils.isBlank(seg)) {
                    vector.add(seg);
                    seg = "";
                }

                /**
                 * TODO: Uncomment the "ELSE IF" clause below if you would
                 * like to keep punctuation
                 */

                // else if (!TextUtils.isBlank("" + c)) { vector.add("" + c); }

                off++;
            }
        }
        if (!TextUtils.isBlank(seg)) {
            vector.add(seg);
        }
        return vector;
    }
}

selector

Text selectors, which enumerate the substrings that might be new words.

CnTextSelector.java

package grid.text.selector;

import grid.common.TextUtils;

public class CnTextSelector extends CommonTextSelector {

    public CnTextSelector(String document, int minSelectLen, int maxSelectLen) {
        super(document, minSelectLen, maxSelectLen);
    }

    protected void adjustCurLen() {
        // skip ahead to the next Chinese character
        while (pos < docLen && !TextUtils.isCnLetter(document.charAt(pos))) {
            pos++;
        }
        // shrink the window if a non-Chinese character falls inside it
        for (int i = 0; i < maxSelectLen && pos + i < docLen; i++) {
            if (!TextUtils.isCnLetter(document.charAt(pos + i))) {
                curLen = i;
                if (curLen < minSelectLen) {
                    pos++;
                    adjustCurLen();
                }
                return;
            }
        }

        curLen = pos + maxSelectLen > docLen ? docLen - pos : maxSelectLen;
    }
}

CommonTextSelector.java

package grid.text.selector;

public class CommonTextSelector implements TextSelector {

    protected String document;

    protected int pos = 0;

    protected int maxSelectLen = 5;

    protected int minSelectLen = 2;

    protected int curLen;

    protected final int docLen;

    public CommonTextSelector(String document, int minSelectLen,
            int maxSelectLen) {
        this.document = document;
        this.minSelectLen = minSelectLen;
        this.maxSelectLen = maxSelectLen;
        docLen = document.length();
        adjustCurLen();
    }

    public void select() {
        // jump past the candidate that was just accepted
        pos += ++curLen;
        adjustCurLen();
    }

    protected void adjustCurLen() {
        curLen = pos + maxSelectLen > docLen ? docLen - pos : maxSelectLen;
    }

    public String next() {
        if (curLen < minSelectLen) {
            pos++;
            adjustCurLen();
        }

        // emit the current window, then shrink it by one character
        if (pos + curLen <= docLen && curLen >= minSelectLen) {
            return document.substring(pos, pos + curLen--);
        } else {
            curLen--;
            return "";
        }
    }

    public boolean end() {
        return curLen < minSelectLen && curLen + pos >= docLen - 1;
    }

    @Override
    public int getCurPos() {
        return pos;
    }
}

TextSelector.java

package grid.text.selector;

public interface TextSelector {

    public boolean end();

    public void select();

    public String next();

    public int getCurPos();
}
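With all three pieces in place, here is a sketch of how a selector walks a document (SelectorDemo is hypothetical). NewWordDiscover drives it exactly like this, except that it also calls select() to jump past words it already knows:

import grid.text.selector.CnTextSelector;
import grid.text.selector.TextSelector;

public class SelectorDemo {
    public static void main(String[] args) {
        TextSelector selector = new CnTextSelector("你好世界hello你好", 2, 4);
        while (!selector.end()) {
            // windows shrink from maxSelectLen down to minSelectLen, then slide:
            // 你好世界, 你好世, 你好, 好世界, ...
            String candidate = selector.next();
            if (!candidate.isEmpty()) {
                System.out.println(candidate);
            }
        }
    }
}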

Test Code

NewWordDiscoverTest.java

package grid.test;

import grid.common.TextDatReader;
import grid.text.evolution.NewWordDiscover;
import grid.text.index.CnPreviewTextIndexer;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Scanner;
import java.util.Set;

public class NewWordDiscoverTest {

    public static void writefile(String m) {
        try {
            File file = new File("result.txt");
            if (!file.exists()) {
                file.createNewFile();
            }
            FileWriter fileWritter = new FileWriter(file.getName(), true);
            BufferedWriter bufferWritter = new BufferedWriter(fileWritter);
            bufferWritter.write(m);
            bufferWritter.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    @SuppressWarnings("resource")
    public static void main(String[] args) throws Exception {
        // clear result.txt before starting, to avoid duplicated data
        File filere = new File("result.txt");
        filere.delete();

        Scanner scan = new Scanner(System.in);
        System.out.println("Enter the name of the file to process:\n");
        String path = scan.next();
        File file = new File(path);
        if (!file.exists() || (!file.isFile())) {
            throw new Exception("The specified file does not exist!");
        }
        long maxsize = 1024 * 1024 * 1024; // 1 GB; larger files get split first
        long size = 1024 * 1024 * 5; // each chunk is at most 5 MB
        long fileLength = file.length();
        if (size <= 0) {
            size = fileLength / 2;
        }
        // number of chunks after splitting
        int num = (fileLength % size != 0) ? (int) (fileLength / size + 1)
                : (int) (fileLength / size);
        if (file.length() >= maxsize) {
            System.out.println("The file is larger than 1 GB. Split it first? 1: yes, 0: no\n");

            int t = scan.nextInt();
            if (t == 1) {
                TextDatReader.divide(path, size);
                System.out.println("Splitting finished\n");
                System.out.println("The chunks are stored in the ./dat folder\n");
            }
            for (int m = 1; m <= num; m++) {
                String pathdived = "./dat/text" + m + ".dat";
                System.out.println("Extracting from chunk " + m + "...");
                discovrWord(pathdived);
            }
        } else {
            System.out.println("Extracting...");
            discovrWord(path);
        }
    }

    private static void discovrWord(String path) throws IOException {
        String document = TextDatReader.read(path);
        NewWordDiscover discover = new NewWordDiscover();
        Set<String> words = discover.discover(document);
        CnPreviewTextIndexer ci = new CnPreviewTextIndexer(document);
        System.out.println("Number of new words: " + words.size());
        System.out.println("Discovered new words:" + "\n");
        for (String newword : words) {
            // report each new word together with its occurrence count
            System.out.println(newword + "," + ci.count(newword) + "\n");
            writefile(newword + "," + ci.count(newword) + "\n");
        }
    }
}

Word-discovery test; the results are shown below.

ParticipleTest.java

package grid.test;

import grid.text.participle.MechanicalParticiple;

import java.util.Vector;

public class ParticipleTest {

    private static String document = "我是中国人";

    public static void main(String args[]) {
        MechanicalParticiple participle = new MechanicalParticiple();
        Vector<String> vec = participle.partition(document);
        System.out.println(vec);
    }
}

Segmentation test; the results are shown below.

Pretty cool, right? You can also try it on the text of 《天龙八部》 (Demi-Gods and Semi-Devils) and see whether the protagonist really turns out to be 乔帮主 (Chief Qiao). If you discover any fresh new words, do tell this blogger, so I don't fall behind the times!

VIP exclusive: the new words from 《天龙八部》. If you want to see the results, silently praise the blogger a hundred times first.

Do all of the above and you'll get a Harry Potter edition as a bonus.
