Commit c831120b authored by Ho Yin Chan's avatar Ho Yin Chan
Browse files

trunk:egs/hkust a vanilla chinese segmenter

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@3222 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent a9d67d47
This diff is collapsed.
This diff is collapsed.
Manifest-Version: 1.0
Class-Path: .
Main-Class: ChiUtf8Segmenter
# Build script for the ChiUtf8Segmenter runnable jar.
#
# Variables:
#   JAVA_COMPILER  Java compiler command.
#   SOURCE_DIR     directory holding the *.java sources.
#   BIN_DIR        directory receiving the compiled *.class files.
#   JAR_FILE       name of the jar produced by `make all`.
JAVA_COMPILER=javac
SOURCE_DIR=.
BIN_DIR=.
JAR_FILE=ChiUtf8Segmenter.jar
SOURCES = $(wildcard $(SOURCE_DIR)/*.java)
CLASSES = $(patsubst $(SOURCE_DIR)/%.java, $(BIN_DIR)/%.class, $(SOURCES))

# `all` and `clean` name actions, not files; declaring them phony keeps them
# working even if a file called `all` or `clean` appears in this directory.
.PHONY: all clean

all: $(JAR_FILE)

# Package the jar using MANIFEST.MF as the manifest.
# NOTE(review): `-C $(BIN_DIR) .` adds everything under BIN_DIR; with
# BIN_DIR=. that is the whole directory, not only the class files.
$(JAR_FILE): $(CLASSES)
	jar -cmf MANIFEST.MF $(JAR_FILE) -C $(BIN_DIR) .
	chmod +x $(JAR_FILE)

# Every class depends on its own source, but the recipe compiles all sources
# in one javac invocation so cross-references between classes resolve.
$(BIN_DIR)/%.class: $(SOURCE_DIR)/%.java
	$(JAVA_COMPILER) -d $(BIN_DIR) $(SOURCE_DIR)/*.java

# Remove the compiled classes and the jar.
clean:
	rm -f $(BIN_DIR)/*.class
	rm -f $(JAR_FILE)
//
// Copyright 2013-2014, Hong Kong University of Science and Technology (author: Ricky Chan Ho Yin)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
import java.lang.*;
import java.util.*;
/**
 * Storage for one search history path: an ordered list of string elements
 * together with the running sum of the log probabilities supplied for them.
 */
public class SearchHistoryPath {
    private int number_element;
    private ArrayList<String> element = null;
    private float log_prob;

    /** Creates an empty path with a log probability of 0. */
    public SearchHistoryPath() {
        number_element = 0;
        element = new ArrayList<String>();
        log_prob = 0.0f;
    }

    /**
     * Appends one element to the path and adds its log probability to the total.
     *
     * @param strVal  element to append
     * @param strProb log probability added to the accumulated total
     */
    public void addElement(String strVal, float strProb) {
        number_element++;
        element.add(strVal);
        log_prob += strProb;
    }

    /** @return number of elements currently stored in the path */
    public int getNumElement() {
        return number_element;
    }

    /** @return accumulated log probability of the path */
    public float getLogProb() {
        return log_prob;
    }

    /** @return the live backing list of elements (not a copy) */
    public ArrayList<String> getList() {
        return element;
    }

    /**
     * Replaces the stored elements with a copy of the given list. The
     * accumulated log probability is left unchanged.
     *
     * @param element_path elements to copy into this path
     */
    public void setList(ArrayList<String> element_path) {
        // Snapshot before clearing: the argument may be this path's own list
        // (e.g. setList(getList())), and clearing first would lose every element.
        ArrayList<String> snapshot = new ArrayList<String>(element_path);
        element.clear();
        element.addAll(snapshot);
        number_element = element.size();
    }

    /**
     * Resets the path to its freshly constructed state. The backing list is
     * emptied but kept, so the path can be reused afterwards without a
     * NullPointerException.
     */
    public void clear() {
        number_element = 0;
        element.clear();
        log_prob = 0.0f;
    }
}
//
// Copyright 2013-2014, Hong Kong University of Science and Technology (author: Ricky Chan Ho Yin)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
import java.io.*;
import java.lang.*;
import java.util.*;
import java.util.regex.*;
/**
 * Word list with the corresponding log probabilities (or costs, as negative
 * values, for segmentation), loaded from a text file.
 *
 * Each non-blank line of the file holds exactly two whitespace-separated
 * fields: the word and its value. When a word occurs more than once, the
 * largest value is kept.
 */
class WordProbMap {
    /** Field separator: one or more blanks, tabs, or line-break characters. */
    private static final Pattern FIELD_SEPARATOR = Pattern.compile("[ \t\r\n]+");

    private String mapName = "wordprobmap";
    private String encoding = "UTF-8";
    private HashMap<String, Float> probmap = null;

    public void setName(String mapName) { this.mapName = mapName; }
    public void setEncoding(String encoding) { this.encoding = encoding; }
    public String getName() { return mapName; }
    public String getEncoding() { return encoding; }

    /** @return the live word-to-value map, or null after {@link #clearMap()} */
    public HashMap<String, Float> getProbMap() { return probmap; }

    /**
     * Loads the map from the default file name ("wordprobmap") using UTF-8.
     *
     * @throws IOException if the file cannot be read or is malformed
     */
    public WordProbMap() throws IOException {
        if (!readWordProbMap()) {
            throw new IOException("read wordprobmap error in WordProbMap.java\n");
        }
    }

    /**
     * Loads the map from the given file.
     *
     * @param wordMapFile path of the word/value file
     * @param encoding    character encoding name used to decode the file
     * @throws IOException if the file cannot be read or is malformed
     */
    public WordProbMap(String wordMapFile, String encoding) throws IOException {
        setName(wordMapFile);
        setEncoding(encoding);
        if (!readWordProbMap()) {
            throw new IOException("read wordprobmap: " + wordMapFile + " error in WordProbMap.java\n");
        }
    }

    /** Empties and releases the map; getProbMap() returns null afterwards. */
    public void clearMap() {
        if (probmap != null) {
            probmap.clear();
            probmap = null;
        }
    }

    /**
     * Reads mapName into probmap.
     *
     * @return true on success; false if an I/O or format error occurred (the
     *         error is printed to stderr)
     */
    private boolean readWordProbMap() {
        FileInputStream fin = null;
        BufferedReader rd = null;
        try {
            fin = new FileInputStream(mapName);
            rd = new BufferedReader(new InputStreamReader(fin, encoding));
            probmap = new HashMap<String, Float>();
            int line_num = 0;
            // Advancing in the for-header guarantees every `continue` moves to
            // the next line; the old while-loop spun forever on a blank line.
            for (String line = rd.readLine(); line != null; line = rd.readLine()) {
                line_num++;
                String[] fields = FIELD_SEPARATOR.split(line);
                // split() yields [] for a whitespace-only line and [""] for an
                // empty one; both are blank lines and are skipped.
                if (fields.length == 0 || (fields.length == 1 && fields[0].length() == 0)) {
                    continue;
                }
                if (fields.length != 2) {
                    throw new IOException("read wordprobmap: "+mapName+" error in line "+line_num+"\n");
                }
                Float value = Float.valueOf(fields[1]);
                Float previous = probmap.get(fields[0]);
                if (previous != null && previous > value) { // appears multiple times, keep max
                    continue;
                }
                probmap.put(fields[0], value);
            }
        }
        catch (IOException e) {
            System.err.println(e);
            return false;
        }
        finally {
            // Release the file handle on every exit path, including errors.
            closeQuietly(rd);
            closeQuietly(fin);
        }
        return true;
    }

    /** Closes a resource, ignoring null and close failures (read-only input). */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        }
        catch (IOException ignored) {
            // Nothing useful can be done if closing an input stream fails.
        }
    }
}
<s></s>
<s>What is it?</s>
Give me a number!!
123.89%%??
<s>Hello!!Friends!!</s>
一二三四点五六
89.567
123健康贴士
全球约20%人口使用社交媒体
Facebook倚重移动平台大力发展亚洲市场
应用商店助力 智能电视将淘汰传统有线电视
感受美国最受欢迎小镇魅力
一睹全球十大自然奇观
德国韩国泰国新加坡马尔代夫东京首尔巴厘岛迪拜济州岛北海道巴黎普罗旺斯柏林巴塞罗那伦敦纽约旧金山夏威夷日本马来西亚南非瑞士法国英国澳大利亚加拿大美国
香港的天气怎么样
上海的天气怎么样
下周二下午会不会很热
什么时候会放晴
今天下午有没有下雨
今天北京的天气如何
今天天气是阴天吗
今天纽约会不会有雪
伦敦的天气
你是哪里人
欧洲天气预报
<s> </s>
<s> What is it ? </s>
Give me a number ! !
123.89 % % ? ?
<s> Hello ! ! Friends ! ! </s>
一二三四点五六
89.567
123 健康 贴士
全球 约 20 % 人口 使用 社交 媒体
Facebook 倚重 移动 平台 大力 发展 亚洲 市场
应用 商店 助力 智能 电视 将 淘汰 传统 有线电视
感受 美国 最 受 欢迎 小镇 魅力 诱惑
一睹 全球 十 大自然 奇观
德国 韩国 泰国 新加坡 马尔代夫 东京 首尔 巴厘岛 迪拜 济州岛 北海道 巴黎 普罗旺斯 柏林 巴塞罗那 伦敦 纽约 旧金山 夏威夷 日本 马来西亚 南非 瑞士 法国 英国 澳大利亚 加拿大 美国
香港 的 天气 怎么样
上海 的 天气 怎么样
下 周二 下午 会 不 会 很 热
什么 时候 会 放晴
今天 下午 有 没有 下雨
今天 北京 的 天气 如何
今天 天气 是 阴天 吗
今天 纽约 会 不 会 有 雪
伦敦 的 天气
你 是 哪里 人
欧洲 天气 预报
<s> </s>
<s> What is it ? </s>
Give me a number ! !
123.89 % % ? ?
<s> Hello ! ! Friends ! ! </s>
一二三四 点 五六
89 . 567
123 健康 贴士
全球 约 20 % 人口 使用 社交 媒体
Facebook 倚重 移动 平台 大力 发展 亚洲 市场
应用 商店 助力 智能 电视 将 淘汰 传统 有线电视
感受 美国 最 受 欢迎 小镇 魅力 诱惑
一睹 全球 十 大自然 奇观
德国 韩国 泰国 新加坡 马尔代夫 东京 首尔 巴厘岛 迪拜 济州岛 北海道 巴黎 普罗旺斯 柏林 巴塞罗那 伦敦 纽约 旧金山 夏威夷 日本 马来西亚 南非 瑞士 法国 英国 澳大利亚 加拿大 美国
香港 的 天气 怎么样
上海 的 天气 怎么样
下 周二 下午 会 不 会 很 热
什么 时候 会 放晴
今天 下午 有 没有 下雨
今天 北京 的 天气 如何
今天 天气 是 阴天 吗
今天 纽约 会 不 会 有 雪
伦敦 的 天气
你 是 哪里 人
欧洲 天气 预报
剑桥 、 昆西 、 牛顿 、 萨默 维尔 、 里 维尔 和 切尔西 等 城市
# Remove previous build products, then rebuild the jar via the Makefile
make clean
make all
# Run with no arguments (per the original note, this prints the command prompt / usage)
java -jar ChiUtf8Segmenter.jar
# Example: run -mode5 on the sample sentences with the 186k word/probability map
java -jar ChiUtf8Segmenter.jar -mode5 example/test_sent.txt 186k_wordprobmap
# Keep that result under a different name before the next run
# (presumably the run above writes example/test_sent.txt.seg -- confirm)
mv example/test_sent.txt.seg example/test_sent.txt.seg0
# Another example: same run with snumbers_u8.txt as an additional argument
# (its role is not visible here -- check the usage output)
java -jar ChiUtf8Segmenter.jar -mode5 example/test_sent.txt 186k_wordprobmap snumbers_u8.txt
亿
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment