10 #ifndef _STRUS_ANALYZER_DOCUMENT_ANALYZER_INTERFACE_HPP_INCLUDED
11 #define _STRUS_ANALYZER_DOCUMENT_ANALYZER_INTERFACE_HPP_INCLUDED
26 class DocumentAnalyzerContextInterface;
28 class NormalizerFunctionInstanceInterface;
30 class TokenizerFunctionInstanceInterface;
32 class AggregatorFunctionInstanceInterface;
34 class SegmenterInterface;
36 class PatternTermFeederInstanceInterface;
38 class PatternMatcherInstanceInterface;
40 class PatternLexerInstanceInterface;
57 const std::string& type,
58 const std::string& selectexpr,
60 const std::vector<NormalizerFunctionInstanceInterface*>& normalizers,
72 const std::string& type,
73 const std::string& selectexpr,
75 const std::vector<NormalizerFunctionInstanceInterface*>& normalizers,
86 const std::string& metaname,
87 const std::string& selectexpr,
89 const std::vector<NormalizerFunctionInstanceInterface*>& normalizers)=0;
96 const std::string& metaname,
106 const std::string& attribname,
107 const std::string& selectexpr,
109 const std::vector<NormalizerFunctionInstanceInterface*>& normalizers)=0;
116 const std::string& subDocumentTypeName,
117 const std::string& selectexpr)=0;
124 const std::string& selectexpr,
134 const std::string& termtype,
135 const std::string& selectexpr,
137 const std::vector<NormalizerFunctionInstanceInterface*>& normalizers,
145 const std::string& patternName,
155 const std::string& patternName,
158 const std::vector<std::string>& selectexpr)=0;
167 const std::string& type,
168 const std::string& patternName,
169 const std::vector<NormalizerFunctionInstanceInterface*>& normalizers,
180 const std::string& type,
181 const std::string& patternName,
182 const std::vector<NormalizerFunctionInstanceInterface*>& normalizers,
192 const std::string& metaname,
193 const std::string& patternName,
194 const std::vector<NormalizerFunctionInstanceInterface*>& normalizers)=0;
202 const std::string& attribname,
203 const std::string& patternName,
204 const std::vector<NormalizerFunctionInstanceInterface*>& normalizers)=0;
212 const std::string& content,
virtual analyzer::Document analyze(const std::string &content, const analyzer::DocumentClass &dclass) const =0
Segment and tokenize a document, assign types to tokens and metadata and normalize their values...
virtual void addSearchIndexFeature(const std::string &type, const std::string &selectexpr, TokenizerFunctionInstanceInterface *tokenizer, const std::vector< NormalizerFunctionInstanceInterface * > &normalizers, int priority, const analyzer::FeatureOptions &options)=0
Declare a feature to be put into the search index.
virtual void addSearchIndexFeatureFromPatternMatch(const std::string &type, const std::string &patternName, const std::vector< NormalizerFunctionInstanceInterface * > &normalizers, int priority, const analyzer::FeatureOptions &options)=0
Declare a feature to be put into the search index derived from a pattern matcher result item...
Defines a description of the properties of an original document processed by the segmenter.
Definition: documentClass.hpp:21
Instance interface for defining a mapping of terms of the document analysis output as lexems used as ...
Definition: patternTermFeederInstanceInterface.hpp:20
Structure describing the internal representation of a document analyzer for introspection.
Defines the context for analyzing multi part documents, iterating on the sub documents defined...
Definition: documentAnalyzerContextInterface.hpp:21
Options to steer the creation of terms in the analyzer.
Definition: featureOptions.hpp:21
Options to steer the position assignment of features.
virtual void defineSubDocument(const std::string &subDocumentTypeName, const std::string &selectexpr)=0
Declare a sub document for the handling of multi part documents in an analyzed content.
virtual void defineSubContent(const std::string &selectexpr, const analyzer::DocumentClass &documentClass)=0
Declare a sub content of the document that has to be processed with a different segmenter.
Structure describing the MIME type plus some attributes that could be relevant for analysis of a docu...
Interface for building the automaton for detecting patterns in text.
Definition: patternMatcherInstanceInterface.hpp:22
Interface for building the automaton for detecting lexems used as basic entities by pattern matching ...
Definition: patternLexerInstanceInterface.hpp:23
Interface for tokenization.
Definition: tokenizerFunctionInstanceInterface.hpp:21
virtual void defineAttribute(const std::string &attribname, const std::string &selectexpr, TokenizerFunctionInstanceInterface *tokenizer, const std::vector< NormalizerFunctionInstanceInterface * > &normalizers)=0
Declare a feature to be defined as document attribute used for summarization (document title...
virtual void defineAttributeFromPatternMatch(const std::string &attribname, const std::string &patternName, const std::vector< NormalizerFunctionInstanceInterface * > &normalizers)=0
Declare a feature to be defined as document attribute used for summarization, derived from a pattern ...
virtual void addForwardIndexFeatureFromPatternMatch(const std::string &type, const std::string &patternName, const std::vector< NormalizerFunctionInstanceInterface * > &normalizers, int priority, const analyzer::FeatureOptions &options)=0
Declare a feature to be put into the forward index derived from a pattern matcher result item...
virtual void addPatternLexem(const std::string &termtype, const std::string &selectexpr, TokenizerFunctionInstanceInterface *tokenizer, const std::vector< NormalizerFunctionInstanceInterface * > &normalizers, int priority)=0
Declare an element to be used as lexem by post processing pattern matching but not put into the resul...
virtual void addForwardIndexFeature(const std::string &type, const std::string &selectexpr, TokenizerFunctionInstanceInterface *tokenizer, const std::vector< NormalizerFunctionInstanceInterface * > &normalizers, int priority, const analyzer::FeatureOptions &options)=0
Declare a feature to be put into the forward index used for summarization extraction.
virtual void defineAggregatedMetaData(const std::string &metaname, AggregatorFunctionInstanceInterface *statfunc)=0
Declare some collected statistics of the document to be put into the meta data table used for restric...
Defines a program for analyzing a document, splitting it into normalized terms that can be fed to the...
Definition: documentAnalyzerInstanceInterface.hpp:43
Structure describing the internal representation of a document analyzer for introspection.
Definition: documentAnalyzerView.hpp:26
virtual ~DocumentAnalyzerInstanceInterface()
Destructor.
Definition: documentAnalyzerInstanceInterface.hpp:47
virtual analyzer::DocumentAnalyzerView view() const =0
Return a structure with all definitions for introspection.
Structure of a document as result of document analysis.
Options to steer the segmenter behaviour or the structure of input if not self defined in the documen...
virtual void defineMetaData(const std::string &metaname, const std::string &selectexpr, TokenizerFunctionInstanceInterface *tokenizer, const std::vector< NormalizerFunctionInstanceInterface * > &normalizers)=0
Declare a feature to be put into the meta data table used for restrictions, weighting and summarizati...
Interface for a parameterized aggregator function.
Definition: aggregatorFunctionInstanceInterface.hpp:22
virtual DocumentAnalyzerContextInterface * createContext(const analyzer::DocumentClass &dclass) const =0
Create the context used for analyzing multipart or very big documents.
virtual void defineMetaDataFromPatternMatch(const std::string &metaname, const std::string &patternName, const std::vector< NormalizerFunctionInstanceInterface * > &normalizers)=0
Declare a feature to be put into the meta data table for restrictions, weighting and summarization...
virtual void defineContentPatternMatcher(const std::string &patternName, PatternMatcherInstanceInterface *matcher, PatternLexerInstanceInterface *lexer, const std::vector< std::string > &selectexpr)=0
Declare a pattern matcher on the document features after other document analysis. ...
Structure of a document created as result of a document analysis.
Definition: document.hpp:25
virtual void defineTokenPatternMatcher(const std::string &patternName, PatternMatcherInstanceInterface *matcher, PatternTermFeederInstanceInterface *feeder)=0
Declare a pattern matcher on the document terms of pattern lexems after normalization.