strusAnalyzer  0.17
documentAnalyzerInstanceInterface.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2014 Patrick P. Frey
3  *
4  * This Source Code Form is subject to the terms of the Mozilla Public
5  * License, v. 2.0. If a copy of the MPL was not distributed with this
6  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7  */
10 #ifndef _STRUS_ANALYZER_DOCUMENT_ANALYZER_INTERFACE_HPP_INCLUDED
11 #define _STRUS_ANALYZER_DOCUMENT_ANALYZER_INTERFACE_HPP_INCLUDED
18 #include <vector>
19 #include <string>
20 
22 namespace strus
23 {
24 
26 class DocumentAnalyzerContextInterface;
28 class NormalizerFunctionInstanceInterface;
30 class TokenizerFunctionInstanceInterface;
32 class AggregatorFunctionInstanceInterface;
34 class SegmenterInterface;
36 class PatternTermFeederInstanceInterface;
38 class PatternMatcherInstanceInterface;
40 class PatternLexerInstanceInterface;
41 
43 class DocumentAnalyzerInstanceInterface
44 {
45 public:
47  virtual ~DocumentAnalyzerInstanceInterface(){}
48 
56  virtual void addSearchIndexFeature(
57  const std::string& type,
58  const std::string& selectexpr,
59  TokenizerFunctionInstanceInterface* tokenizer,
60  const std::vector<NormalizerFunctionInstanceInterface*>& normalizers,
61  int priority,
62  const analyzer::FeatureOptions& options)=0;
63 
71  virtual void addForwardIndexFeature(
72  const std::string& type,
73  const std::string& selectexpr,
74  TokenizerFunctionInstanceInterface* tokenizer,
75  const std::vector<NormalizerFunctionInstanceInterface*>& normalizers,
76  int priority,
77  const analyzer::FeatureOptions& options)=0;
78 
85  virtual void defineMetaData(
86  const std::string& metaname,
87  const std::string& selectexpr,
88  TokenizerFunctionInstanceInterface* tokenizer,
89  const std::vector<NormalizerFunctionInstanceInterface*>& normalizers)=0;
90 
95  virtual void defineAggregatedMetaData(
96  const std::string& metaname,
97  AggregatorFunctionInstanceInterface* statfunc)=0;
98 
105  virtual void defineAttribute(
106  const std::string& attribname,
107  const std::string& selectexpr,
108  TokenizerFunctionInstanceInterface* tokenizer,
109  const std::vector<NormalizerFunctionInstanceInterface*>& normalizers)=0;
110 
115  virtual void defineSubDocument(
116  const std::string& subDocumentTypeName,
117  const std::string& selectexpr)=0;
118 
123  virtual void defineSubContent(
124  const std::string& selectexpr,
125  const analyzer::DocumentClass& documentClass)=0;
126 
133  virtual void addPatternLexem(
134  const std::string& termtype,
135  const std::string& selectexpr,
136  TokenizerFunctionInstanceInterface* tokenizer,
137  const std::vector<NormalizerFunctionInstanceInterface*>& normalizers,
138  int priority)=0;
139 
144  virtual void defineTokenPatternMatcher(
145  const std::string& patternName,
146  PatternMatcherInstanceInterface* matcher,
147  PatternTermFeederInstanceInterface* feeder)=0;
148 
154  virtual void defineContentPatternMatcher(
155  const std::string& patternName,
156  PatternMatcherInstanceInterface* matcher,
157  PatternLexerInstanceInterface* lexer,
158  const std::vector<std::string>& selectexpr)=0;
159 
166  virtual void addSearchIndexFeatureFromPatternMatch(
167  const std::string& type,
168  const std::string& patternName,
169  const std::vector<NormalizerFunctionInstanceInterface*>& normalizers,
170  int priority,
171  const analyzer::FeatureOptions& options)=0;
172 
179  virtual void addForwardIndexFeatureFromPatternMatch(
180  const std::string& type,
181  const std::string& patternName,
182  const std::vector<NormalizerFunctionInstanceInterface*>& normalizers,
183  int priority,
184  const analyzer::FeatureOptions& options)=0;
185 
191  virtual void defineMetaDataFromPatternMatch(
192  const std::string& metaname,
193  const std::string& patternName,
194  const std::vector<NormalizerFunctionInstanceInterface*>& normalizers)=0;
195 
201  virtual void defineAttributeFromPatternMatch(
202  const std::string& attribname,
203  const std::string& patternName,
204  const std::vector<NormalizerFunctionInstanceInterface*>& normalizers)=0;
205 
211  virtual analyzer::Document analyze(
212  const std::string& content,
213  const analyzer::DocumentClass& dclass) const=0;
214 
218  virtual DocumentAnalyzerContextInterface* createContext(
219  const analyzer::DocumentClass& dclass) const=0;
220 
223  virtual analyzer::DocumentAnalyzerView view() const=0;
224 };
225 
226 }//namespace
227 #endif
228 
virtual analyzer::Document analyze(const std::string &content, const analyzer::DocumentClass &dclass) const =0
Segment and tokenize a document, assign types to tokens and metadata and normalize their values...
virtual void addSearchIndexFeature(const std::string &type, const std::string &selectexpr, TokenizerFunctionInstanceInterface *tokenizer, const std::vector< NormalizerFunctionInstanceInterface * > &normalizers, int priority, const analyzer::FeatureOptions &options)=0
Declare a feature to be put into the search index.
virtual void addSearchIndexFeatureFromPatternMatch(const std::string &type, const std::string &patternName, const std::vector< NormalizerFunctionInstanceInterface * > &normalizers, int priority, const analyzer::FeatureOptions &options)=0
Declare a feature to be put into the search index derived from a pattern matcher result item...
Defines a description of the properties of an original document processed by the segmenter.
Definition: documentClass.hpp:21
Instance interface for defining a mapping of terms of the document analysis output as lexems used as ...
Definition: patternTermFeederInstanceInterface.hpp:20
Structure describing the internal representation of a document analyzer for introspection.
Defines the context for analyzing multi part documents, iterating on the sub documents defined...
Definition: documentAnalyzerContextInterface.hpp:21
Options to steer the creation of terms in the analyzer.
Definition: featureOptions.hpp:21
Options to steer the position assignment of features.
virtual void defineSubDocument(const std::string &subDocumentTypeName, const std::string &selectexpr)=0
Declare a sub document for the handling of multi part documents in an analyzed content.
virtual void defineSubContent(const std::string &selectexpr, const analyzer::DocumentClass &documentClass)=0
Declare a sub content of the document that has to be processed with a different segmenter.
Structure describing the MIME type plus some attributes that could be relevant for analysis of a docu...
Interface for building the automaton for detecting patterns in text.
Definition: patternMatcherInstanceInterface.hpp:22
Interface for building the automaton for detecting lexems used as basic entities by pattern matching ...
Definition: patternLexerInstanceInterface.hpp:23
Interface for tokenization.
Definition: tokenizerFunctionInstanceInterface.hpp:21
virtual void defineAttribute(const std::string &attribname, const std::string &selectexpr, TokenizerFunctionInstanceInterface *tokenizer, const std::vector< NormalizerFunctionInstanceInterface * > &normalizers)=0
Declare a feature to be defined as document attribute used for summarization (document title...
virtual void defineAttributeFromPatternMatch(const std::string &attribname, const std::string &patternName, const std::vector< NormalizerFunctionInstanceInterface * > &normalizers)=0
Declare a feature to be defined as document attribute used for summarization, derived from a pattern ...
virtual void addForwardIndexFeatureFromPatternMatch(const std::string &type, const std::string &patternName, const std::vector< NormalizerFunctionInstanceInterface * > &normalizers, int priority, const analyzer::FeatureOptions &options)=0
Declare a feature to be put into the forward index derived from a pattern matcher result item...
virtual void addPatternLexem(const std::string &termtype, const std::string &selectexpr, TokenizerFunctionInstanceInterface *tokenizer, const std::vector< NormalizerFunctionInstanceInterface * > &normalizers, int priority)=0
Declare an element to be used as lexem by post processing pattern matching but not put into the resul...
virtual void addForwardIndexFeature(const std::string &type, const std::string &selectexpr, TokenizerFunctionInstanceInterface *tokenizer, const std::vector< NormalizerFunctionInstanceInterface * > &normalizers, int priority, const analyzer::FeatureOptions &options)=0
Declare a feature to be put into the forward index used for summarization extraction.
virtual void defineAggregatedMetaData(const std::string &metaname, AggregatorFunctionInstanceInterface *statfunc)=0
Declare some collected statistics of the document to be put into the meta data table used for restric...
Defines a program for analyzing a document, splitting it into normalized terms that can be fed to the...
Definition: documentAnalyzerInstanceInterface.hpp:43
Structure describing the internal representation of a document analyzer for introspection.
Definition: documentAnalyzerView.hpp:26
virtual ~DocumentAnalyzerInstanceInterface()
Destructor.
Definition: documentAnalyzerInstanceInterface.hpp:47
virtual analyzer::DocumentAnalyzerView view() const =0
Return a structure with all definitions for introspection.
Structure of a document as result of document analysis.
Options to steer the segmenter behaviour or the structure of input if not self defined in the documen...
virtual void defineMetaData(const std::string &metaname, const std::string &selectexpr, TokenizerFunctionInstanceInterface *tokenizer, const std::vector< NormalizerFunctionInstanceInterface * > &normalizers)=0
Declare a feature to be put into the meta data table used for restrictions, weighting and summarizati...
Interface for a parameterized aggregator function.
Definition: aggregatorFunctionInstanceInterface.hpp:22
virtual DocumentAnalyzerContextInterface * createContext(const analyzer::DocumentClass &dclass) const =0
Create the context used for analyzing multipart or very big documents.
virtual void defineMetaDataFromPatternMatch(const std::string &metaname, const std::string &patternName, const std::vector< NormalizerFunctionInstanceInterface * > &normalizers)=0
Declare a feature to be put into the meta data table for restrictions, weighting and summarization...
virtual void defineContentPatternMatcher(const std::string &patternName, PatternMatcherInstanceInterface *matcher, PatternLexerInstanceInterface *lexer, const std::vector< std::string > &selectexpr)=0
Declare a pattern matcher on the document features after other document analysis. ...
Structure of a document created as result of a document analysis.
Definition: document.hpp:25
virtual void defineTokenPatternMatcher(const std::string &patternName, PatternMatcherInstanceInterface *matcher, PatternTermFeederInstanceInterface *feeder)=0
Declare a pattern matcher on the document terms of pattern lexems after normalization.