创建 Maven 项目的 pom.xml 文件：
<!-- Maven POM for the Lucene 4.10.2 demo project.
     Dependencies: JUnit (tests), slf4j-log4j12 (logging), Lucene core /
     analyzers / query parser, and the IK Analyzer for Chinese segmentation. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.pactera</groupId>
  <artifactId>pactera-lucene</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.10</version>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-log4j12</artifactId>
      <version>1.6.4</version>
    </dependency>
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-core</artifactId>
      <version>4.10.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-analyzers-common</artifactId>
      <version>4.10.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-queryparser</artifactId>
      <version>4.10.2</version>
    </dependency>
    <dependency>
      <groupId>cn.itcast.lucene.analyzer</groupId>
      <artifactId>ik-analyzer</artifactId>
      <version>2012-4.x</version>
    </dependency>
  </dependencies>
</project>
<!-- 测试 (tests follow) -->
//测试创建索引 @Test public void testIndexWriter() throws IOException{ //创建索引目录 Directory directory = FSDirectory.open(new File("d:\\directory")); //创建标准分词器 Analyzer analyzer = new StandardAnalyzer(); //索引配置 IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_4_10_2, analyzer); indexWriterConfig.setOpenMode(OpenMode.CREATE); //写索引 IndexWriter indexWriter = new IndexWriter(directory,indexWriterConfig); //创建文档对象 Document doc = new Document(); doc.add(new IntField("id", 18, Store.YES)); doc.add(new TextField("title", "我们都是党的接班人yes or no?", Store.YES)); doc.add(new LongField("price", 6388L, Store.YES)); doc.add(new StringField("pic", "www.baidu.com", Store.YES)); //添加文档 indexWriter.addDocument(doc); indexWriter.commit(); indexWriter.close(); }在指定索引目录下查看索引
这种文件可以通过两种方式查看。第一种是使用工具：
用工具打开指定的目录就可以看到是怎么创建索引的
可以看到，使用标准分词器时，汉字按单个字全部被拆分了。
用lucene提供的TokenStream查看
@Test public void testTokenStream() throws IOException{ //创建标准分词器 Analyzer analyzer = new StandardAnalyzer(); //词汇列表 TokenStream tokenStream = analyzer.tokenStream("title", "我们都是党的接班人yes or no?"); //tokenStream指针指向开始位置 tokenStream.reset(); //设置分词偏移量引用 OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class); //设置分词词语引用 CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); //遍历词汇列表 while(tokenStream.incrementToken()){ //分词开始位置 System.out.println("分词开始位置:" + offsetAttribute.startOffset()); //分词词语 System.out.println("最小分词单元:" + charTermAttribute); //分词结束位置 System.out.println("分词结束位置:" + offsetAttribute.endOffset()); } }结果