10 #ifndef _STRUS_ANALYZER_TEXT_PROCESSOR_INTERFACE_HPP_INCLUDED
11 #define _STRUS_ANALYZER_TEXT_PROCESSOR_INTERFACE_HPP_INCLUDED
21 class DocumentClassDetectorInterface;
23 class SegmenterInterface;
25 class NormalizerFunctionInterface;
27 class TokenizerFunctionInterface;
29 class TokenizerFunctionInstanceInterface;
31 class AggregatorFunctionInterface;
33 class PatternLexerInterface;
35 class PatternMatcherInterface;
37 class PatternMatcherProgramInterface;
39 class PatternTermFeederInterface;
41 class PosTaggerDataInterface;
43 class PosTaggerInterface;
45 class TokenMarkupInstanceInterface;
170 const char* ar[] = {
"Segmenter",
"Tokenizer",
"Normalizer",
"Aggregator",
"PatternLexer",
"PatternMatcher",0};
virtual void defineTokenizer(const std::string &name, TokenizerFunctionInterface *tokenizer)=0
Define a tokenizer by name.
Defines a description of the properties of an original document processed by the segmenter.
Definition: documentClass.hpp:21
virtual PosTaggerDataInterface * createPosTaggerData(TokenizerFunctionInstanceInterface *tokenizer) const =0
Create a data structure to feed with POS tagging info.
Addresses a document segmenter.
Definition: textProcessorInterface.hpp:161
virtual void defineAggregator(const std::string &name, AggregatorFunctionInterface *aggregator)=0
Define an aggregator function by name.
virtual void defineNormalizer(const std::string &name, NormalizerFunctionInterface *normalizer)=0
Define a normalizer by name.
virtual void definePatternLexer(const std::string &name, PatternLexerInterface *lexer)=0
Define a pattern matching lexer by name.
virtual const AggregatorFunctionInterface * getAggregator(const std::string &name) const =0
Get a const reference to a statistics collector function object that implements the collection of som...
virtual bool detectDocumentClass(analyzer::DocumentClass &dclass, const char *contentBegin, std::size_t contentBeginSize, bool isComplete) const =0
Detect the document class from a document start chunk and set the content description attributes...
Interface for the aggregator function constructor.
Definition: aggregatorFunctionInterface.hpp:24
Interface for creating an automaton for detecting patterns of tokens in a document stream...
Definition: patternMatcherInterface.hpp:21
Options to steer the segmenter behaviour or the structure of input if not self defined in the documen...
Definition: segmenterOptions.hpp:25
Interface for a tokenizer function.
Definition: tokenizerFunctionInterface.hpp:25
virtual const SegmenterInterface * getSegmenterByName(const std::string &segmenterName) const =0
Get a document segmenter object reference.
Structure describing the MIME type plus some attributes that could be relevant for analysis of a docu...
virtual void defineSegmenterOptions(const std::string &scheme, const analyzer::SegmenterOptions &options)=0
Define segmenter options by document scheme identifier.
virtual const PatternTermFeederInterface * getPatternTermFeeder() const =0
Get the default pattern term feeder interface for post processing pattern matching on analyzer output...
virtual const TokenizerFunctionInterface * getTokenizer(const std::string &name) const =0
Get a const reference to a tokenizer object that implements the splitting of text segments into tok...
Interface for instantiating the data structure of an automaton for detecting lexemes used as basic ent...
Definition: patternTermFeederInterface.hpp:21
Interface for the data built by a POS tagger.
Definition: posTaggerDataInterface.hpp:24
Interface for tokenization.
Definition: tokenizerFunctionInstanceInterface.hpp:21
Defines a detector that returns a content description for a document content it recognizes.
Definition: documentClassDetectorInterface.hpp:21
Interface for instantiating the data structure of an automaton for detecting lexemes used as basic ent...
Definition: patternLexerInterface.hpp:21
virtual void defineSegmenter(const std::string &name, SegmenterInterface *segmenter)=0
Define a document segmenter by name.
virtual analyzer::SegmenterOptions getSegmenterOptions(const std::string &scheme) const =0
Get the options for a document segmenter for a specific document type.
virtual const PatternMatcherInterface * getPatternMatcher(const std::string &name) const =0
Get a const reference to a pattern matcher.
virtual std::string getResourceFilePath(const std::string &filename) const =0
Get the absolute path of a resource file.
Interface for building the automaton for detecting patterns of tokens in a document stream...
Definition: tokenMarkupInstanceInterface.hpp:24
Addresses a normalizer.
Definition: textProcessorInterface.hpp:163
virtual const NormalizerFunctionInterface * getNormalizer(const std::string &name) const =0
Get a const reference to a normalizer object that implements the transformation of a token into a ter...
Addresses an aggregator.
Definition: textProcessorInterface.hpp:164
virtual void definePatternMatcher(const std::string &name, PatternMatcherInterface *matcher)=0
Define a pattern matcher by name.
virtual std::vector< std::string > getFunctionList(const FunctionType &type) const =0
Get a list of all functions of a specific type available.
static const char * functionTypeName(FunctionType t)
Definition: textProcessorInterface.hpp:168
Interface for the normalizer constructor.
Definition: normalizerFunctionInterface.hpp:26
virtual void defineDocumentClassDetector(DocumentClassDetectorInterface *detector)=0
Define a content detector.
virtual const SegmenterInterface * getSegmenterByMimeType(const std::string &mimetype) const =0
Get a document segmenter object reference that is able to process the specified MIME type...
virtual const PosTaggerInterface * getPosTagger() const =0
Get the default POS tagger interface to do POS tagging of documents.
Interface for the construction of a POS tagger instance for a specified segmenter.
Definition: posTaggerInterface.hpp:25
Addresses a pattern matcher.
Definition: textProcessorInterface.hpp:166
virtual TokenMarkupInstanceInterface * createTokenMarkupInstance() const =0
Create an interface for markup of content.
virtual ~TextProcessorInterface()
Destructor.
Definition: textProcessorInterface.hpp:53
Options to steer the segmenter behaviour or the structure of input if not self defined in the documen...
Addresses a pattern lexer.
Definition: textProcessorInterface.hpp:165
Addresses a tokenizer.
Definition: textProcessorInterface.hpp:162
FunctionType
Function type for fetching descriptions of available functions.
Definition: textProcessorInterface.hpp:159
Defines an interface for creating instances of programs for document segmentation.
Definition: segmenterInterface.hpp:29
Interface for the object providing tokenizers and normalizers used for creating terms from segments o...
Definition: textProcessorInterface.hpp:49
virtual const PatternLexerInterface * getPatternLexer(const std::string &name) const =0
Get a const reference to a pattern lexer.