lucene怎么使用nlpir进行分词
发布网友
发布时间:2022-04-23 06:47
我来回答
共2个回答
热心网友
时间:2022-06-16 23:56
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Runtime.InteropServices;
using Lucene.Net.Analysis;
namespace Lucene.Net.Analysis.DChinese
{
[StructLayout(LayoutKind.Explicit)]
public struct result_t
{
[FieldOffset(0)]
public int start;
[FieldOffset(4)]
public int length;
[FieldOffset(8)]
public int sPos1;
[FieldOffset(12)]
public int sPos2;
[FieldOffset(16)]
public int sPos3;
[FieldOffset(20)]
public int sPos4;
[FieldOffset(24)]
public int sPos5;
[FieldOffset(28)]
public int sPos
热心网友
时间:2022-06-16 23:56
词法分析是lucene的一个模块,lucene自带的中文分词器(analyzer)一般效果不是很理想。现在项目中用的分词工具是北理工的NLPIR,但是NLPIR没有一个现成的lucene分词器(analyzer)实现类。这里就需要自己来写一个比较简短的基于NLPIR的analyzer实现类。
不同的Analyzer就是组合不同的Tokenizer和TokenFilter得到最后的TokenStream。以StandardAnalyzer为例,阅读其源码可以看到:TokenFilter使用的是装饰者模式,Tokenizer和TokenFilter都继承自TokenStream类。
import com.sun.jna.Library;
import com.sun.jna.Native;
/**
 * Thin JNA wrapper around the native NLPIR (ICTCLAS) segmenter.
 * The native library file is expected at "code/NLPIR" relative to the
 * working directory.
 */
public class NLPTool {
    /** JNA mapping of the exported NLPIR C functions. */
    public interface CLibrary extends Library {
        // Loaded eagerly, once, and shared by all callers.
        CLibrary Instance = (CLibrary) Native.loadLibrary("code/NLPIR",
                CLibrary.class);

        public int NLPIR_Init(String sDataPath, int encoding,
                String sLicenceCode);

        public String NLPIR_ParagraphProcess(String sSrc, int bPOSTagged);

        public String NLPIR_GetKeyWords(String sLine, int nMaxKeyLimit,
                boolean bWeightOut);

        public String NLPIR_GetFileKeyWords(String sLine, int nMaxKeyLimit,
                boolean bWeightOut);

        public int NLPIR_AddUserWord(String sWord);// add by qp 2008.11.10

        public int NLPIR_DelUsrWord(String sWord);// add by qp 2008.11.10

        public String NLPIR_GetLastErrorMsg();

        public void NLPIR_Exit();
    }

    /**
     * Segments (and optionally POS-tags) a piece of Chinese text with NLPIR.
     *
     * @param sInput raw text to segment
     * @param type   0 = segmentation only; 1 = also emit POS tags
     * @return segmented text with tokens separated by spaces, or "" on failure
     */
    public static String SegAndPos(String sInput, int type) {
        String argu = ".";       // NLPIR data directory
        String nativeBytes = "";
        int charset_type = 1;    // NLPIR encoding constant (presumably UTF-8 — confirm against NLPIR docs)
        // Bug fix: the original ignored the init result, and called
        // NLPIR_Exit() inside the try so a throwing ParagraphProcess
        // leaked the native session.
        // NOTE(review): NLPIR_Init is documented to return 0 on failure — confirm for this library version.
        if (CLibrary.Instance.NLPIR_Init(argu, charset_type, "0") == 0) {
            System.err.println("NLPIR init failed: "
                    + CLibrary.Instance.NLPIR_GetLastErrorMsg());
            return nativeBytes;
        }
        try {
            // CLibrary.Instance.NLPIR_AddUserWord("奇虎360 nt");
            nativeBytes = CLibrary.Instance.NLPIR_ParagraphProcess(sInput, type); //第二个参数为1表示,进行词性标注。
        } catch (Exception ex) {
            ex.printStackTrace();
        } finally {
            // Always release native resources, even when processing fails.
            CLibrary.Instance.NLPIR_Exit();
        }
        return nativeBytes;
    }
}
下面是基于 CharTokenizer 的自定义 Tokenizer 实现(Java 代码):
import java.io.Reader;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.util.Version;
/*
*@author:xyd
*@department:CasCeep
*/
/**
 * Whitespace-splitting tokenizer: NLPIR has already inserted a space
 * between segments, so any run of non-whitespace characters is one token.
 */
public class MyChineseTokenizer extends CharTokenizer {
    public MyChineseTokenizer(Reader in) {
        super(Version.LUCENE_47, in);
    }

    public MyChineseTokenizer(AttributeFactory factory, Reader in) {
        // Bug fix: the original dropped 'factory' and delegated to the
        // two-arg super, so callers supplying a custom AttributeFactory
        // silently got the default one.
        super(Version.LUCENE_47, factory, in);
    }

    /*
     * @see org.apache.lucene.analysis.util.CharTokenizer#isTokenChar(int)
     */
    @Override
    protected boolean isTokenChar(int c) {
        return !Character.isWhitespace(c);
    }
}
下面是自定义 Analyzer 的实现,把 NLPIR 分词结果接入 Lucene 的 TokenStream 流水线(Java 代码):
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import lucene.NLPTool;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
/*
*@author:xyd
*@department:CasCeep
*/
public final class MyChineseAnalyzer extends Analyzer {
private CharArraySet stopWords;
/**
* An array containing some common English words that are not usually useful
* for searching.
*/
public static final String[] CHINESE_ENGLISH_STOP_WORDS = {"我", "的" };
/** Builds an analyzer which removes words in ENGLISH_STOP_WORDS. */
/** Builds an analyzer whose stop filter removes the words in CHINESE_ENGLISH_STOP_WORDS. */
public MyChineseAnalyzer() {
    // Materialize the stop-word array into the CharArraySet used by StopFilter.
    CharArraySet stopSet = StopFilter.makeStopSet(Version.LUCENE_47,
            CHINESE_ENGLISH_STOP_WORDS);
    this.stopWords = stopSet;
}
/**
 * Builds the token pipeline: NLPIR pre-segments the text (tokens separated
 * by spaces), then a whitespace tokenizer splits it, followed by standard,
 * lower-case and stop-word filters.
 *
 * @param arg0   field name (unused)
 * @param reader source of the raw text
 */
@Override
protected TokenStreamComponents createComponents(String arg0, Reader reader) {
    Tokenizer tokenizer = null;
    TokenStream tokFilter = null;
    try {
        // Bug fix: the original called readLine() once, silently dropping
        // every line after the first (and passing null to SegAndPos when
        // the reader was empty). Read the whole reader instead.
        BufferedReader br = new BufferedReader(reader);
        StringBuilder text = new StringBuilder();
        String line;
        while ((line = br.readLine()) != null) {
            if (text.length() > 0) {
                text.append('\n');
            }
            text.append(line);
        }
        String string = NLPTool.SegAndPos(text.toString(), 0);
        // 分词中间加入了空格 — NLPIR output is whitespace-delimited.
        tokenizer = new MyChineseTokenizer(new StringReader(string));
        tokFilter = new StandardFilter(Version.LUCENE_47, tokenizer);
        tokFilter = new LowerCaseFilter(Version.LUCENE_47, tokFilter);
        // 使用stopWords进行过滤
        tokFilter = new StopFilter(Version.LUCENE_47, tokFilter, stopWords);
    } catch (IOException e) {
        e.printStackTrace();
    }
    return new TokenStreamComponents(tokenizer, tokFilter);
}
/**
 * Demo: tokenizes a sample sentence and prints, for each token, its
 * offsets, text, type, position length and position increment.
 */
public static void main(String[] args) throws IOException {
    String string = "我的老师在中国科学院工作";
    Analyzer analyzer = new MyChineseAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("field",
            new StringReader(string));
    // Register attributes ONCE, before consuming the stream — the original
    // re-fetched them on every loop pass, which works (addAttribute returns
    // the existing instance) but obscures the TokenStream contract.
    CharTermAttribute attribute = tokenStream
            .addAttribute(CharTermAttribute.class);                  // 文本属性
    OffsetAttribute offsetAtt = tokenStream
            .addAttribute(OffsetAttribute.class);                    // 偏移量
    PositionIncrementAttribute positionAttr = tokenStream
            .addAttribute(PositionIncrementAttribute.class);         // 距离增加量
    PositionLengthAttribute posL = tokenStream
            .addAttribute(PositionLengthAttribute.class);            // 距离
    TypeAttribute typeAttr = tokenStream
            .addAttribute(TypeAttribute.class);                      // 词性
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        System.out.println(offsetAtt.startOffset() + ":"
                + offsetAtt.endOffset() + "\t" + attribute + "\t"
                + typeAttr.type() + "\t" + posL.getPositionLength() + "\t"
                + positionAttr.getPositionIncrement());
    }
    // Bug fix: complete the stream contract and release resources —
    // the original never called end()/close().
    tokenStream.end();
    tokenStream.close();
    analyzer.close();
}