42 使用 Lucene 和机器学习进行全文搜索并排序

Posted 香港胖仔

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了42lucene和机器学习进行全文搜索,并排序相关的知识,希望对你有一定的参考价值。

package com.lucene.test;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.joone.engine.FullSynapse;
import org.joone.engine.LinearLayer;
import org.joone.engine.Monitor;
import org.joone.engine.NeuralNetEvent;
import org.joone.engine.NeuralNetListener;
import org.joone.engine.SigmoidLayer;
import org.joone.engine.learning.TeachingSynapse;
import org.joone.io.MemoryInputSynapse;
import org.joone.io.MemoryOutputSynapse;
import org.joone.net.NeuralNet;
import org.junit.Test;
import org.wltea.analyzer.lucene.IKAnalyzer;

import com.lucene.domain.Article;

/**
 * Scratch JUnit tests holding two independent experiments:
 * <ul>
 *   <li>{@link #testCreateIndex()} and {@link #testSearch()} build and query a Lucene
 *       index over a few files of the {@code ./20_newsgroups} corpus;</li>
 *   <li>{@link #testNeuralNet()} trains a 2-3-1 Joone network on the XOR truth table
 *       and prints the answers it gives afterwards.</li>
 * </ul>
 * The class registers itself as the network's {@link NeuralNetListener} and reports
 * progress on standard output.
 */
public class TestLucene implements NeuralNetListener{
    /** Root folder of the corpus; each sub-folder is read as one group of files. */
    private static final String CORPUS_DIR = "./20_newsgroups";
    /** Folder holding the Lucene index; shared by the indexing and the search test. */
    private static final String INDEX_DIR = "./indexDir/";
    /** Document field names; indexing and searching must agree on them. */
    private static final String FIELD_ID = "id";
    private static final String FIELD_TITLE = "title";
    private static final String FIELD_CONTENT = "content";
    /** How many corpus sub-folders are indexed at most. */
    private static final int MAX_GROUPS = 1;
    /** How many files of each sub-folder are indexed at most. */
    private static final int MAX_FILES_PER_GROUP = 10;
    /** Upper bound on the number of hits fetched by {@link #testSearch()}. */
    private static final int MAX_HITS = 100;

    private NeuralNet nnet = null;
    private MemoryInputSynapse inputSynapse,desireOutputSynapse;
    LinearLayer input;
    SigmoidLayer hidden,output;
    boolean singleThreadMode = true;

    //XOR input
    private final double[][] inputArray = new double[][]{
            {0.0,0.0},
            {0.0,1.0},
            {1.0,0.0},
            {1.0,1.0}
    };

    //XOR desired output, row-aligned with inputArray.
    //The last row used to be 1.0, which is the OR truth table and contradicted the XOR label.
    private final double[][] desiredOutputArray = new double[][]{
            {0.0},
            {1.0},
            {1.0},
            {0.0}
    };

    /**
     * Builds the index: reads up to {@link #MAX_FILES_PER_GROUP} files from up to
     * {@link #MAX_GROUPS} sub-folders of the corpus, wraps each one in an {@link Article}
     * (running number, file name, file text) and adds it to the index in {@link #INDEX_DIR}.
     * Does nothing when the corpus folder is missing or cannot be listed.
     *
     * NOTE(review): the writer is opened with the default configuration, so running this
     * test repeatedly presumably appends duplicate documents - confirm.
     *
     * @throws Exception if a file cannot be read or the index cannot be written
     */
    @Test
    public void testCreateIndex() throws Exception{
        File[] groups = new File(CORPUS_DIR).listFiles();
        if (groups == null) {
            // Not a directory (or unreadable): nothing to index.
            return;
        }

        int fileNum = 1;
        // One directory and one writer for the whole run; both are closed even when
        // reading or indexing a file fails half-way.
        try (Directory indexDirectory = FSDirectory.open(Paths.get(INDEX_DIR));
             IndexWriter indexWriter = new IndexWriter(indexDirectory,
                     new IndexWriterConfig(new WhitespaceAnalyzer()))) {
            int groupCount = Math.min(MAX_GROUPS, groups.length);
            for (int i = 0; i < groupCount; i++) {
                File[] posts = groups[i].listFiles();
                if (posts == null) {
                    // Entry is a plain file, not a group folder.
                    continue;
                }
                int postCount = Math.min(MAX_FILES_PER_GROUP, posts.length);
                for (int j = 0; j < postCount; j++) {
                    if (!posts[j].isFile()) {
                        continue;
                    }
                    String content = readContent(posts[j]);
                    System.out.println(content);
                    Article article = new Article(fileNum, posts[j].getName(), content);
                    indexWriter.addDocument(toDocument(article));
                    System.out.println("have already add file to fileDocment system"+fileNum);
                    fileNum = fileNum+1;
                }
            }
        }
    }

    /**
     * Reads a whole file as text.
     * All bytes are read before decoding, so no stale buffer content is appended.
     * NOTE(review): ISO-8859-1 is used because it maps every byte to a character and
     * therefore cannot fail; this assumes the corpus is single-byte text - confirm.
     *
     * @param file the file to read
     * @return the decoded file content
     * @throws IOException if the file cannot be read
     */
    private static String readContent(File file) throws IOException {
        return new String(Files.readAllBytes(file.toPath()), StandardCharsets.ISO_8859_1);
    }

    /**
     * Converts an article into a Lucene document with stored id, title and content fields.
     *
     * @param article the article to convert
     * @return the document to add to the index
     */
    private static Document toDocument(Article article) {
        Document doc = new Document();
        doc.add(new TextField(FIELD_ID, article.getId().toString(), Store.YES));
        doc.add(new TextField(FIELD_TITLE, article.getTitle(), Store.YES));
        doc.add(new TextField(FIELD_CONTENT, article.getContent(), Store.YES));
        return doc;
    }

    /**
     * Rebuilds an article from the stored fields of an index document.
     *
     * @param doc a document produced by {@link #toDocument(Article)}
     * @return the corresponding article
     */
    private static Article toArticle(Document doc) {
        return new Article(Integer.parseInt(doc.get(FIELD_ID)),
                doc.get(FIELD_TITLE), doc.get(FIELD_CONTENT));
    }

    /**
     * Searches the content field of the index for a fixed term and prints, for every hit,
     * its score and internal document number, followed by the total hit count and the
     * id/title of each article found.
     *
     * @throws IOException if the index cannot be opened or read
     * @throws ParseException if the query string cannot be parsed
     */
    @Test
    public void testSearch() throws IOException, ParseException{
        // 1. Search condition
        String queryCondition = "philosophical";

        // 2. Run the search
        List<Article> articles = new ArrayList<Article>();
        int count;

        // Directory and reader are released even if parsing or searching throws.
        try (Directory directory = FSDirectory.open(Paths.get(INDEX_DIR));
             IndexReader indexReader = DirectoryReader.open(directory)) {
            Analyzer analyzer = new WhitespaceAnalyzer();

            // Turn the query string into a Query object (matched against the content field)
            QueryParser queryParser = new QueryParser(FIELD_CONTENT, analyzer);
            Query query = queryParser.parse(queryCondition);

            IndexSearcher indexSearcher = new IndexSearcher(indexReader);
            TopDocs topDocs = indexSearcher.search(query, MAX_HITS);

            count = topDocs.totalHits; // total number of matches
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) { // the top N matches
                int docId = scoreDoc.doc;
                System.out.println("得分是:"+scoreDoc.score+"内部编号是:"+docId);

                // Load the stored document by its internal number and convert it back
                articles.add(toArticle(indexSearcher.doc(docId)));
            }
        }

        // 3. Show the result on the console. Printed as a full line on stdout: the former
        // System.err.print interleaved with the result lines below.
        System.out.println("总结果数:"+count);
        for (Article article : articles) {
            System.out.println("查询结果:ID为:"+article.getId()+",title为:"+article.getTitle());
        }
    }


    /**
     * Builds the network, trains it and prints its answers for the four input patterns.
     */
    @Test
    public void testNeuralNet(){
        initNeuralNet();
        train();
        interrogate();
    }

    /**
     * Creates the layers (2 linear inputs, 3 sigmoid hidden rows, 1 sigmoid output),
     * connects them with full synapses, attaches the input and teaching synapses and
     * registers this object as listener of the resulting {@link NeuralNet}.
     */
    public void initNeuralNet(){
        //First create the three layers
        input = new LinearLayer();
        hidden = new SigmoidLayer();
        output = new SigmoidLayer();

        //set the dimensions of the layers
        input.setRows(2);
        hidden.setRows(3);
        output.setRows(1);

        input.setLayerName("L.input");
        hidden.setLayerName("L.hidden");
        output.setLayerName("L.output");

        //Now create the two Synapses
        FullSynapse synapse_IH = new FullSynapse();//input -->hidden conn
        FullSynapse synapse_HO = new FullSynapse();//hidden -->output conn

        //Connect the input layer with the hidden layer
        input.addOutputSynapse(synapse_IH);
        hidden.addInputSynapse(synapse_IH);

        //Connect the hidden layer with the output layer
        hidden.addOutputSynapse(synapse_HO);
        output.addInputSynapse(synapse_HO);

        //the input to the neural net
        inputSynapse = new MemoryInputSynapse();
        input.addInputSynapse(inputSynapse);

        //The Trainer and its desired output
        desireOutputSynapse = new MemoryInputSynapse();
        TeachingSynapse trainer = new TeachingSynapse();

        trainer.setDesired(desireOutputSynapse);

        //Now we add this structure to a NeuralNet object
        nnet = new NeuralNet();

        nnet.addLayer(input,NeuralNet.INPUT_LAYER);
        nnet.addLayer(hidden,NeuralNet.HIDDEN_LAYER);
        nnet.addLayer(output, NeuralNet.OUTPUT_LAYER);
        nnet.setTeacher(trainer);
        output.addOutputSynapse(trainer);
        nnet.addNeuralNetListener(this);
    }

    /**
     * Feeds the training patterns and their desired outputs to the network and runs
     * 5000 learning cycles, then prints the elapsed time.
     *
     * @throws IllegalStateException if {@link #initNeuralNet()} has not been called
     */
    public void train(){
        requireNetwork("train");
        //set the inputs
        inputSynapse.setInputArray(inputArray);
        inputSynapse.setAdvancedColumnSelector("1,2");
        //set the desired outputs
        desireOutputSynapse.setInputArray(desiredOutputArray);
        desireOutputSynapse.setAdvancedColumnSelector("1");
        //get the monitor object to train or feed forward
        Monitor monitor = nnet.getMonitor();

        //set the monitor parameters
        monitor.setLearningRate(0.8);
        monitor.setMomentum(0.3);
        monitor.setTrainingPatterns(inputArray.length);
        monitor.setTotCicles(5000);
        monitor.setLearning(true);

        long initms = System.currentTimeMillis();
        //Run the network in single-thread,synchronized mode
        monitor.setSingleThreadMode(singleThreadMode);
        nnet.go(true);
        System.out.println("Total time="+(System.currentTimeMillis()-initms)+"ms");
    }


    /**
     * Runs the network once, with learning switched off, over the four input patterns
     * and prints the first output value produced for each of them.
     *
     * @throws IllegalStateException if {@link #initNeuralNet()} has not been called
     */
    public void interrogate(){
        // Checked up front: the former null test came after nnet had already been used.
        requireNetwork("interrogate");
        double[][] testPatterns = new double[][]{
                {0.0,1.0},
                {1.0,0.0},
                {1.0,1.0},
                {0.0,0.0}
        };
        //set the inputs
        inputSynapse.setInputArray(testPatterns);
        inputSynapse.setAdvancedColumnSelector("1,2");
        Monitor monitor = nnet.getMonitor();
        monitor.setTrainingPatterns(testPatterns.length);
        monitor.setTotCicles(1);
        monitor.setLearning(false);

        //the output synapse collects what the net writes
        MemoryOutputSynapse memOut = new MemoryOutputSynapse();
        nnet.addOutputSynapse(memOut);
        System.out.println(nnet.check());
        monitor.setSingleThreadMode(singleThreadMode);
        nnet.go();
        for (int i = 0; i < testPatterns.length; i++) {
            double[] pattern = memOut.getNextPattern();
            System.out.println("Output pattern #"+(i+1)+"="+pattern[0]);
        }
        System.out.println("Interrogating Finished");
    }

    /**
     * Fails fast when the network has not been built yet.
     *
     * @param caller name of the public method performing the check, used in the message
     * @throws IllegalStateException if {@link #initNeuralNet()} has not been called
     */
    private void requireNetwork(String caller) {
        if (nnet == null || inputSynapse == null || desireOutputSynapse == null) {
            throw new IllegalStateException(
                    "initNeuralNet() must be called before " + caller + "()");
        }
    }


    /** Listener callback for a finished cycle; intentionally does nothing. */
    @Override
    public void cicleTerminated(NeuralNetEvent arg0) {

    }

    /**
     * Prints the cycle number and the global error whenever the monitor's current
     * cycle counter is a multiple of 100.
     */
    @Override
    public void errorChanged(NeuralNetEvent e) {
        Monitor mon=(Monitor) e.getSource();
        if(mon.getCurrentCicle()%100==0){
            System.out.println("Epoch:"+(mon.getTotCicles()-mon.getCurrentCicle())+"RMSE:"
                    +mon.getGlobalError());
        }
    }

    /** Prints whether the network was started for training or for interrogation. */
    @Override
    public void netStarted(NeuralNetEvent e) {
        Monitor mon = (Monitor) e.getSource();
        // Written as one line; printing the prefix with println split the message in two.
        System.out.println("Network started for "
                + (mon.isLearning() ? "training" : "interrogation."));
    }

    /** Prints the global error reported by the monitor when the network stops. */
    @Override
    public void netStopped(NeuralNetEvent e) {
        Monitor mon = (Monitor) e.getSource();
        System.out.println("Network stopped . Last RMSE="
                +mon.getGlobalError());
    }

    /** Prints the error text passed in when the network stops because of an error. */
    @Override
    public void netStoppedError(NeuralNetEvent e, String error) {
        System.out.println("Network stopped due the following error:"
                +error);
    }
}

运行结果(testSearch 的控制台输出):

得分是:0.25462872内部编号是:7840
得分是:0.24006625内部编号是:7841
总结果数:2
查询结果:ID为:2,title为:51060
查询结果:ID为:1,title为:49960

 

以上是关于“42 使用 Lucene 和机器学习进行全文搜索并排序”的主要内容。如果未能解决你的问题,请参考以下文章:

全文检索——Lucene

Lucene学习总结

Lucene全文搜索原理与使用

《从Lucene到Elasticsearch:全文检索实战》学习笔记二

Lucene全文检索学习入门

Lucene全文检索学习入门