web-dev-qa-db-fra.com

Comment faire la saisie semi-automatique / les suggestions de requêtes dans Lucene ?

Je cherche un moyen de faire la saisie semi-automatique / les suggestions de requêtes dans Lucene. J'ai cherché un peu sur Google et expérimenté un peu, mais tous les exemples que j'ai vus semblent mettre en place des filtres dans Solr. Nous n'utilisons pas Solr et ne prévoyons pas de passer à Solr dans un avenir proche ; de toute façon, Solr ne fait évidemment qu'encapsuler Lucene, donc j'imagine qu'il doit y avoir un moyen de le faire !

J'ai étudié l'utilisation d'EdgeNGramFilter, et je me rends compte que je devrais exécuter le filtre sur les champs d'index et retirer les jetons, puis les comparer à la requête entrée ... J'ai du mal à faire le lien entre les deux en un peu de code, donc l'aide est très appréciée!

Pour être clair sur ce que je recherche (j'ai réalisé que je n'étais pas trop clair, désolé) - je cherche une solution où, lors de la recherche d'un terme, il retournerait une liste de requêtes suggérées. Lorsque vous saisissez "inter" dans le champ de recherche, il revient avec une liste de requêtes suggérées, telles que "Internet", "International", etc.

47
Mat Mannion

Sur la base de la réponse de @Alexandre Victoor, j'ai écrit une petite classe basée sur le correcteur orthographique Lucene du package contrib (en utilisant le LuceneDictionary inclus) qui fait exactement ce que je veux.

Cela permet de réindexer à partir d'un index source unique avec un seul champ et fournit des suggestions de termes. Les résultats sont triés par le nombre de documents correspondants avec ce terme dans l'index d'origine, donc les termes les plus populaires apparaissent en premier. Semble fonctionner assez bien :)

import Java.io.IOException;
import Java.io.Reader;
import Java.util.ArrayList;
import Java.util.HashMap;
import Java.util.Iterator;
import Java.util.List;
import Java.util.Map;

import org.Apache.lucene.analysis.Analyzer;
import org.Apache.lucene.analysis.ISOLatin1AccentFilter;
import org.Apache.lucene.analysis.LowerCaseFilter;
import org.Apache.lucene.analysis.StopFilter;
import org.Apache.lucene.analysis.TokenStream;
import org.Apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.Apache.lucene.analysis.ngram.EdgeNGramTokenFilter.Side;
import org.Apache.lucene.analysis.standard.StandardFilter;
import org.Apache.lucene.analysis.standard.StandardTokenizer;
import org.Apache.lucene.document.Document;
import org.Apache.lucene.document.Field;
import org.Apache.lucene.index.CorruptIndexException;
import org.Apache.lucene.index.IndexReader;
import org.Apache.lucene.index.IndexWriter;
import org.Apache.lucene.index.Term;
import org.Apache.lucene.search.IndexSearcher;
import org.Apache.lucene.search.Query;
import org.Apache.lucene.search.ScoreDoc;
import org.Apache.lucene.search.Sort;
import org.Apache.lucene.search.TermQuery;
import org.Apache.lucene.search.TopDocs;
import org.Apache.lucene.search.spell.LuceneDictionary;
import org.Apache.lucene.store.Directory;
import org.Apache.lucene.store.FSDirectory;

/**
 * Search term auto-completer, works for single terms (so use on the last term
 * of the query).
 * <p>
 * Returns more popular terms first.
 * 
 * @author Mat Mannion, [email protected]
 */
public final class Autocompleter {

    private static final String GRAMMED_WORDS_FIELD = "words";

    private static final String SOURCE_Word_FIELD = "sourceWord";

    private static final String COUNT_FIELD = "count";

    private static final String[] ENGLISH_STOP_WORDS = {
    "a", "an", "and", "are", "as", "at", "be", "but", "by",
    "for", "i", "if", "in", "into", "is",
    "no", "not", "of", "on", "or", "s", "such",
    "t", "that", "the", "their", "then", "there", "these",
    "they", "this", "to", "was", "will", "with"
    };

    private final Directory autoCompleteDirectory;

    private IndexReader autoCompleteReader;

    private IndexSearcher autoCompleteSearcher;

    public Autocompleter(String autoCompleteDir) throws IOException {
        this.autoCompleteDirectory = FSDirectory.getDirectory(autoCompleteDir,
                null);

        reOpenReader();
    }

    public List<String> suggestTermsFor(String term) throws IOException {
        // get the top 5 terms for query
        Query query = new TermQuery(new Term(GRAMMED_WORDS_FIELD, term));
        Sort sort = new Sort(COUNT_FIELD, true);

        TopDocs docs = autoCompleteSearcher.search(query, null, 5, sort);
        List<String> suggestions = new ArrayList<String>();
        for (ScoreDoc doc : docs.scoreDocs) {
            suggestions.add(autoCompleteReader.document(doc.doc).get(
                    SOURCE_Word_FIELD));
        }

        return suggestions;
    }

    @SuppressWarnings("unchecked")
    public void reIndex(Directory sourceDirectory, String fieldToAutocomplete)
            throws CorruptIndexException, IOException {
        // build a dictionary (from the spell package)
        IndexReader sourceReader = IndexReader.open(sourceDirectory);

        LuceneDictionary dict = new LuceneDictionary(sourceReader,
                fieldToAutocomplete);

        // code from
        // org.Apache.lucene.search.spell.SpellChecker.indexDictionary(
        // Dictionary)
        IndexReader.unlock(autoCompleteDirectory);

        // use a custom analyzer so we can do EdgeNGramFiltering
        IndexWriter writer = new IndexWriter(autoCompleteDirectory,
        new Analyzer() {
            public TokenStream tokenStream(String fieldName,
                    Reader reader) {
                TokenStream result = new StandardTokenizer(reader);

                result = new StandardFilter(result);
                result = new LowerCaseFilter(result);
                result = new ISOLatin1AccentFilter(result);
                result = new StopFilter(result,
                    ENGLISH_STOP_WORDS);
                result = new EdgeNGramTokenFilter(
                    result, Side.FRONT,1, 20);

                return result;
            }
        }, true);

        writer.setMergeFactor(300);
        writer.setMaxBufferedDocs(150);

        // go through every Word, storing the original Word (incl. n-grams) 
        // and the number of times it occurs
        Map<String, Integer> wordsMap = new HashMap<String, Integer>();

        Iterator<String> iter = (Iterator<String>) dict.getWordsIterator();
        while (iter.hasNext()) {
            String Word = iter.next();

            int len = Word.length();
            if (len < 3) {
                continue; // too short we bail but "too long" is fine...
            }

            if (wordsMap.containsKey(Word)) {
                throw new IllegalStateException(
                        "This should never happen in Lucene 2.3.2");
                // wordsMap.put(Word, wordsMap.get(Word) + 1);
            } else {
                // use the number of documents this Word appears in
                wordsMap.put(Word, sourceReader.docFreq(new Term(
                        fieldToAutocomplete, Word)));
            }
        }

        for (String Word : wordsMap.keySet()) {
            // ok index the Word
            Document doc = new Document();
            doc.add(new Field(SOURCE_Word_FIELD, Word, Field.Store.YES,
                    Field.Index.UN_TOKENIZED)); // orig term
            doc.add(new Field(GRAMMED_WORDS_FIELD, Word, Field.Store.YES,
                    Field.Index.TOKENIZED)); // grammed
            doc.add(new Field(COUNT_FIELD,
                    Integer.toString(wordsMap.get(Word)), Field.Store.NO,
                    Field.Index.UN_TOKENIZED)); // count

            writer.addDocument(doc);
        }

        sourceReader.close();

        // close writer
        writer.optimize();
        writer.close();

        // re-open our reader
        reOpenReader();
    }

    private void reOpenReader() throws CorruptIndexException, IOException {
        if (autoCompleteReader == null) {
            autoCompleteReader = IndexReader.open(autoCompleteDirectory);
        } else {
            autoCompleteReader.reopen();
        }

        autoCompleteSearcher = new IndexSearcher(autoCompleteReader);
    }

    public static void main(String[] args) throws Exception {
        Autocompleter autocomplete = new Autocompleter("/index/autocomplete");

        // run this to re-index from the current index, shouldn't need to do
        // this very often
        // autocomplete.reIndex(FSDirectory.getDirectory("/index/live", null),
        // "content");

        String term = "steve";

        System.out.println(autocomplete.suggestTermsFor(term));
        // prints [steve, steven, stevens, stevenson, stevenage]
    }

}
37
Mat Mannion

Voici une translittération de l'implémentation de Mat en C # pour Lucene.NET, ainsi qu'un extrait pour câbler une zone de texte à l'aide de la fonctionnalité de saisie semi-automatique de jQuery.

<input id="search-input" name="query" placeholder="Search database." type="text" />

... Saisie semi-automatique JQuery:

// don't navigate away from the field when pressing tab on a selected item
$( "#search-input" ).keydown(function (event) {
    if (event.keyCode === $.ui.keyCode.TAB && $(this).data("autocomplete").menu.active) {
        event.preventDefault();
    }
});

$( "#search-input" ).autocomplete({
    source: '@Url.Action("SuggestTerms")', // <-- ASP.NET MVC Razor syntax
    minLength: 2,
    delay: 500,
    focus: function () {
        // prevent value inserted on focus
        return false;
    },
    select: function (event, ui) {
        // Replace only the word currently being typed with the chosen
        // completion, keeping any earlier words in the box intact.
        var terms = this.value.split(/\s+/);
        terms.pop(); // remove the partial word the dropdown matched
        // BUG FIX: was `terms.Push(...)` — JavaScript is case-sensitive,
        // so the capitalized name threw "terms.Push is not a function".
        terms.push(ui.item.value.trim()); // add completed item
        this.value = terms.join(" ");
        return false;
    }
    // (trailing comma after the last option removed — it is a syntax
    // error in older IE JScript engines)
});

... voici le code du contrôleur ASP.NET MVC:

    //
    // GET: /MyApp/SuggestTerms?term=something
    // Returns a JSON array of completions for the word the user is
    // currently typing (the last whitespace-separated token of `term`).
    public JsonResult SuggestTerms(string term)
    {
        // Nothing typed yet -> empty suggestion list.
        if (string.IsNullOrWhiteSpace(term))
            return Json(new string[] { });

        // Autocomplete applies to single terms, so keep only the last word.
        var lastWord = term.Split().Last();

        // Fetch suggestions from the search service.
        var suggestions = SearchSvc.SuggestTermsFor(lastWord).ToArray();

        // AllowGet: this endpoint is read-only and safe to expose over GET.
        return Json(suggestions, JsonRequestBehavior.AllowGet);
    }

... et voici le code de Mat en C #:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Lucene.Net.Store;
using Lucene.Net.Index;
using Lucene.Net.Search;
using SpellChecker.Net.Search.Spell;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis.NGram;
using Lucene.Net.Documents;

namespace Cipher.Services
{
    /// <summary>
    /// Search term auto-completer, works for single terms (so use on the last term of the query).
    /// Returns more popular terms first.
    /// <br/>
    /// Maintains a dedicated auto-complete index: one document per source word,
    /// holding the original word, its edge-n-grammed form (for prefix matching)
    /// and its document frequency in the source index (for popularity ranking).
    /// <br/>
    /// Author: Mat Mannion, [email protected]
    /// <seealso cref="http://stackoverflow.com/questions/120180/how-to-do-query-auto-completion-suggestions-in-lucene"/>
    /// </summary>
    ///
    public class SearchAutoComplete {

        /// <summary>Maximum number of suggestions returned per query.</summary>
        public int MaxResults { get; set; }

        /// <summary>
        /// Analyzer used when building the auto-complete index: standard
        /// tokenization, lower-cased, ASCII-folded, stop words removed, then
        /// expanded to front edge n-grams of length 1..20 so any prefix of a
        /// word becomes an indexed term.
        /// </summary>
        private class AutoCompleteAnalyzer : Analyzer
        {
            public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
            {
                TokenStream result = new StandardTokenizer(kLuceneVersion, reader);

                result = new StandardFilter(result);
                result = new LowerCaseFilter(result);
                result = new ASCIIFoldingFilter(result);
                result = new StopFilter(false, result, StopFilter.MakeStopSet(kEnglishStopWords));
                result = new EdgeNGramTokenFilter(
                    result, Lucene.Net.Analysis.NGram.EdgeNGramTokenFilter.DEFAULT_SIDE, 1, 20);

                return result;
            }
        }

        private static readonly Lucene.Net.Util.Version kLuceneVersion = Lucene.Net.Util.Version.LUCENE_29;

        /// <summary>Field holding the edge-n-grammed word (the field queried).</summary>
        private static readonly String kGrammedWordsField = "words";

        /// <summary>Field holding the original word (what suggestions return).</summary>
        private static readonly String kSourceWordField = "sourceWord";

        /// <summary>Field holding the word's document frequency, for sorting.</summary>
        private static readonly String kCountField = "count";

        private static readonly String[] kEnglishStopWords = {
            "a", "an", "and", "are", "as", "at", "be", "but", "by",
            "for", "i", "if", "in", "into", "is",
            "no", "not", "of", "on", "or", "s", "such",
            "t", "that", "the", "their", "then", "there", "these",
            "they", "this", "to", "was", "will", "with"
        };

        private readonly Directory m_directory;

        private IndexReader m_reader;

        private IndexSearcher m_searcher;

        public SearchAutoComplete(string autoCompleteDir) :
            this(FSDirectory.Open(new System.IO.DirectoryInfo(autoCompleteDir)))
        {
        }

        public SearchAutoComplete(Directory autoCompleteDir, int maxResults = 8)
        {
            this.m_directory = autoCompleteDir;
            MaxResults = maxResults;

            ReplaceSearcher();
        }

        /// <summary>
        /// Find terms matching the given partial word that appear in the highest number of documents.</summary>
        /// <param name="term">A word or part of a word</param>
        /// <returns>A list of suggested completions, most popular first</returns>
        public IEnumerable<String> SuggestTermsFor(string term)
        {
            if (m_searcher == null)
                return new string[] { };

            // The grammed field was lower-cased at index time, so lower-case
            // the lookup term too (invariant culture, to match LowerCaseFilter
            // regardless of the thread's UI culture).
            Query query = new TermQuery(new Term(kGrammedWordsField, term.ToLowerInvariant()));

            // BUG FIX: sort by count DESCENDING (reverse: true). The original
            // port sorted ascending, returning the LEAST popular terms first,
            // contradicting the class contract "Returns more popular terms
            // first" (and forcing callers to reverse the result themselves).
            Sort sort = new Sort(new SortField(kCountField, SortField.INT, true));

            TopDocs docs = m_searcher.Search(query, null, MaxResults, sort);
            string[] suggestions = docs.ScoreDocs.Select(doc =>
                m_reader.Document(doc.Doc).Get(kSourceWordField)).ToArray();

            return suggestions;
        }


        /// <summary>
        /// Open the index in the given directory and create a new index of word frequency for the
        /// given index.</summary>
        /// <param name="sourceDirectory">Directory containing the index to count words in.</param>
        /// <param name="fieldToAutocomplete">The field in the index that should be analyzed.</param>
        public void BuildAutoCompleteIndex(Directory sourceDirectory, String fieldToAutocomplete)
        {
            // build a dictionary (from the spell package)
            using (IndexReader sourceReader = IndexReader.Open(sourceDirectory, true))
            {
                LuceneDictionary dict = new LuceneDictionary(sourceReader, fieldToAutocomplete);

                // code from
                // org.apache.lucene.search.spell.SpellChecker.indexDictionary(
                // Dictionary)
                //IndexWriter.Unlock(m_directory);

                // use a custom analyzer so we can do EdgeNGramFiltering
                var analyzer = new AutoCompleteAnalyzer();
                using (var writer = new IndexWriter(m_directory, analyzer, true, IndexWriter.MaxFieldLength.LIMITED))
                {
                    writer.MergeFactor = 300;
                    writer.SetMaxBufferedDocs(150);

                    // go through every word, storing the original word (incl. n-grams)
                    // and the number of times it occurs
                    foreach (string word in dict)
                    {
                        if (word.Length < 3)
                            continue; // too short we bail but "too long" is fine...

                        // ok index the word
                        // use the number of documents this word appears in
                        int freq = sourceReader.DocFreq(new Term(fieldToAutocomplete, word));
                        var doc = MakeDocument(fieldToAutocomplete, word, freq);

                        writer.AddDocument(doc);
                    }

                    writer.Optimize();
                }

            }

            // re-open our reader
            ReplaceSearcher();
        }

        /// <summary>Builds one auto-complete document for a single word.</summary>
        private static Document MakeDocument(String fieldToAutocomplete, string word, int frequency)
        {
            var doc = new Document();
            doc.Add(new Field(kSourceWordField, word, Field.Store.YES,
                    Field.Index.NOT_ANALYZED)); // orig term
            doc.Add(new Field(kGrammedWordsField, word, Field.Store.YES,
                    Field.Index.ANALYZED)); // grammed
            doc.Add(new Field(kCountField,
                    frequency.ToString(), Field.Store.NO,
                    Field.Index.NOT_ANALYZED)); // count
            return doc;
        }

        /// <summary>
        /// (Re)opens the reader/searcher so queries see the latest built index;
        /// leaves the searcher null when no index exists yet.</summary>
        private void ReplaceSearcher()
        {
            if (IndexReader.IndexExists(m_directory))
            {
                if (m_reader == null)
                {
                    m_reader = IndexReader.Open(m_directory, true);
                }
                else
                {
                    // BUG FIX: Reopen() does not refresh in place — it RETURNS
                    // a (possibly new) reader. The original discarded that
                    // return value, so the searcher kept serving stale data
                    // after BuildAutoCompleteIndex().
                    IndexReader newReader = m_reader.Reopen();
                    if (newReader != m_reader)
                    {
                        m_reader.Close();
                        m_reader = newReader;
                    }
                }

                m_searcher = new IndexSearcher(m_reader);
            }
            else
            {
                m_searcher = null;
            }
        }


    }
}
26
ThisIsTheDave

mon code basé sur lucene 4.2 , peut vous aider

import Java.io.File;
import Java.io.IOException;

import org.Apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.Apache.lucene.index.DirectoryReader;
import org.Apache.lucene.index.IndexWriterConfig;
import org.Apache.lucene.index.IndexWriterConfig.OpenMode;
import org.Apache.lucene.search.spell.Dictionary;
import org.Apache.lucene.search.spell.LuceneDictionary;
import org.Apache.lucene.search.spell.PlainTextDictionary;
import org.Apache.lucene.search.spell.SpellChecker;
import org.Apache.lucene.store.Directory;
import org.Apache.lucene.store.FSDirectory;
import org.Apache.lucene.store.IOContext;
import org.Apache.lucene.store.RAMDirectory;
import org.Apache.lucene.util.Version;
import org.wltea4pinyin.analyzer.lucene.IKAnalyzer4PinYin;


/**
 * 
 * 
 * @author <a href="mailto:[email protected]"></a>
 * @version 2013-11-25上午11:13:59
 */
public class LuceneSpellCheckerDemoService {

private static final String INDEX_FILE = "/Users/r/Documents/jar/luke/youtui/index";
private static final String INDEX_FILE_SPELL = "/Users/r/Documents/jar/luke/spell";

private static final String INDEX_FIELD = "app_name_quanpin";

public static void main(String args[]) {

    try {
        //
        PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new IKAnalyzer4PinYin(
                true));

        //  read index conf
        IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_42, wrapper);
        conf.setOpenMode(OpenMode.CREATE_OR_APPEND);

        // read dictionary
        Directory directory = FSDirectory.open(new File(INDEX_FILE));
        RAMDirectory ramDir = new RAMDirectory(directory, IOContext.READ);
        DirectoryReader indexReader = DirectoryReader.open(ramDir);

        Dictionary dic = new LuceneDictionary(indexReader, INDEX_FIELD);


        SpellChecker sc = new SpellChecker(FSDirectory.open(new File(INDEX_FILE_SPELL)));
        //sc.indexDictionary(new PlainTextDictionary(new File("myfile.txt")), conf, false);
        sc.indexDictionary(dic, conf, true);
        String[] strs = sc.suggestSimilar("zhsiwusdazhanjiangshi", 10);
        for (int i = 0; i < strs.length; i++) {
            System.out.println(strs[i]);
        }
        sc.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}


}
5
user2098849

Vous pouvez utiliser la classe PrefixQuery sur un index "dictionnaire". La classe LuceneDictionary pourrait également être utile.

Jetez un œil à cet article lié ci-dessous. Il explique comment implémenter la fonctionnalité "Vouliez-vous dire?" disponible dans les moteurs de recherche modernes tels que Google. Vous n'avez peut-être pas besoin de quelque chose d'aussi complexe que décrit dans l'article. Cependant, l'article explique comment utiliser le package de sorts Lucene.

Une façon de construire un index "dictionnaire" serait d'itérer sur un LuceneDictionary.

J'espère que cela aide

Vouliez-vous dire: Lucene? (Page 1)

Vouliez-vous dire: Lucene? (Page 2)

Vouliez-vous dire: Lucene? (Page 3)

4
Alexandre Victoor

En plus de la publication ci-dessus (très appréciée) sur la conversion en C# : si vous utilisez .NET 3.5, vous devrez inclure le code du EdgeNGramTokenFilter — ou du moins je l'ai dû — car avec Lucene 2.9.2, ce filtre est absent de la version .NET pour autant que je sache. J'ai dû aller chercher en ligne la version Java 2.9.3 et la rétroporter en .NET — j'espère que cela rendra la procédure moins pénible pour quelqu'un...

Modifier: veuillez également noter que le tableau renvoyé par la fonction SuggestTermsFor () est trié par nombre croissant, vous voudrez probablement l'inverser pour obtenir les termes les plus populaires en premier dans votre liste

using System.IO;
using System.Collections;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Util;

namespace Lucene.Net.Analysis.NGram
{

/**
 * Tokenizes the given token into n-grams of given size(s).
 * <p>
 * This {@link TokenFilter} creates n-grams from the beginning edge or ending
 * edge of an input token. Back-port of the Java Lucene 2.9.3 filter to C#
 * for Lucene.Net 2.9.2 / .NET 3.5, where the filter is missing.
 * </p>
 */
public class EdgeNGramTokenFilter : TokenFilter
{
    public static Side DEFAULT_SIDE = Side.FRONT;
    // NOTE(review): a default MAX of 1 looks odd next to MIN = 1 (only
    // single-character grams) — kept verbatim from the Java original.
    public static int DEFAULT_MAX_GRAM_SIZE = 1;
    public static int DEFAULT_MIN_GRAM_SIZE = 1;

    // Replace this with an enum when the Java 1.5 upgrade is made, the impl will be simplified
    /** Specifies which side of the input the n-gram should be generated from.
     *  Type-safe "enum" with two singleton instances, FRONT and BACK. */
    public class Side
    {
        private string label;

        /** Get the n-gram from the front of the input */
        public static Side FRONT = new Side("front");

        /** Get the n-gram from the end of the input */
        public static Side BACK = new Side("back");

        // Private ctor
        private Side(string label) { this.label = label; }

        public string getLabel() { return label; }

        // Get the appropriate Side from a string; returns null for an
        // unrecognized name (callers treat null as an invalid argument).
        public static Side getSide(string sideName)
        {
            if (FRONT.getLabel().Equals(sideName))
            {
                return FRONT;
            }
            else if (BACK.getLabel().Equals(sideName))
            {
                return BACK;
            }
            return null;
        }
    }

    // Configured gram-size range and edge to take grams from.
    private int minGram;
    private int maxGram;
    private Side side;
    // State for the token currently being expanded into grams:
    private char[] curTermBuffer;   // copy of the source token's text (null = need next token)
    private int curTermLength;
    private int curGramSize;        // size of the next gram to emit
    private int tokStart;           // start offset of the source token

    private TermAttribute termAtt;
    private OffsetAttribute offsetAtt;

    // NOTE(review): this ctor never sets side/minGram/maxGram, so a filter
    // built through it would produce degenerate grams — presumably intended
    // only as a base for subclasses; confirm before using directly.
    protected EdgeNGramTokenFilter(TokenStream input) : base(input)
    {
        this.termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
        this.offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
    }

    /**
     * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
     *
     * @param input {@link TokenStream} holding the input to be tokenized
     * @param side the {@link Side} from which to chop off an n-gram
     * @param minGram the smallest n-gram to generate
     * @param maxGram the largest n-gram to generate
     */
    public EdgeNGramTokenFilter(TokenStream input, Side side, int minGram, int maxGram)
        : base(input)
    {

        if (side == null)
        {
            throw new System.ArgumentException("sideLabel must be either front or back");
        }

        if (minGram < 1)
        {
            throw new System.ArgumentException("minGram must be greater than zero");
        }

        if (minGram > maxGram)
        {
            throw new System.ArgumentException("minGram must not be greater than maxGram");
        }

        this.minGram = minGram;
        this.maxGram = maxGram;
        this.side = side;
        this.termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
        this.offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
    }

    /**
     * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
     *
     * @param input {@link TokenStream} holding the input to be tokenized
     * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
     * @param minGram the smallest n-gram to generate
     * @param maxGram the largest n-gram to generate
     */
    public EdgeNGramTokenFilter(TokenStream input, string sideLabel, int minGram, int maxGram)
        : this(input, Side.getSide(sideLabel), minGram, maxGram)
    {

    }

    // Emits one n-gram per call: for each source token, produces grams of
    // size minGram, minGram+1, ..., up to maxGram (capped by the token
    // length), then pulls the next source token from the wrapped stream.
    public override bool IncrementToken()
    {
        while (true)
        {
            if (curTermBuffer == null)
            {
                // No token in progress: fetch the next one, or signal
                // end-of-stream if the wrapped stream is exhausted.
                if (!input.IncrementToken())
                {
                    return false;
                }
                else
                {
                    // Snapshot the token's text/offset; TermBuffer is
                    // reused by the stream, hence the Clone().
                    curTermBuffer = (char[])termAtt.TermBuffer().Clone();
                    curTermLength = termAtt.TermLength();
                    curGramSize = minGram;
                    tokStart = offsetAtt.StartOffset();
                }
            }
            if (curGramSize <= maxGram)
            {
                if (!(curGramSize > curTermLength         // if the remaining input is too short, we can't generate any n-grams
                    || curGramSize > maxGram))
                {       // if we have hit the end of our n-gram size range, quit
                    // grab gramSize chars from front or back
                    int start = side == Side.FRONT ? 0 : curTermLength - curGramSize;
                    int end = start + curGramSize;
                    ClearAttributes();
                    offsetAtt.SetOffset(tokStart + start, tokStart + end);
                    termAtt.SetTermBuffer(curTermBuffer, start, curGramSize);
                    curGramSize++;
                    return true;
                }
            }
            // Current token exhausted — loop around to fetch the next one.
            curTermBuffer = null;
        }
    }

    // Pass-throughs for the deprecated Token-based API; real work happens
    // in IncrementToken() above.
    public override  Token Next(Token reusableToken)
    {
        return base.Next(reusableToken);
    }
    public override Token Next()
    {
        return base.Next();
    }
    public override void Reset()
    {
        base.Reset();
        // Drop any half-emitted token so a reused stream starts clean.
        curTermBuffer = null;
    }
}
}
4
megawatts