Interface for the object providing tokenizers and normalizers used for creating terms from segments of text and functions for collecting overall document statistics.
More...
|
| virtual | ~TextProcessorInterface () |
| | Destructor. More...
|
| |
| virtual std::string | getResourceFilePath (const std::string &filename) const =0 |
| | Get the absolute path of a resource file. More...
|
| |
| virtual const SegmenterInterface * | getSegmenterByName (const std::string &segmenterName) const =0 |
| | Get a document segmenter object reference. More...
|
| |
| virtual const SegmenterInterface * | getSegmenterByMimeType (const std::string &mimetype) const =0 |
| | Get a document segmenter object reference that is able to process the specified MIME type. More...
|
| |
| virtual analyzer::SegmenterOptions | getSegmenterOptions (const std::string &scheme) const =0 |
| | Get the options for a document segmenter for a specific document type. More...
|
| |
virtual const
TokenizerFunctionInterface * | getTokenizer (const std::string &name) const =0 |
| | Get a const reference to a tokenizer object that implements the splitting of text segments into tokens. More...
|
| |
virtual const
NormalizerFunctionInterface * | getNormalizer (const std::string &name) const =0 |
| | Get a const reference to a normalizer object that implements the transformation of a token into a term string. More...
|
| |
virtual const
AggregatorFunctionInterface * | getAggregator (const std::string &name) const =0 |
| | Get a const reference to a statistics collector function object that implements some counting of document parts. More...
|
| |
virtual const
PatternLexerInterface * | getPatternLexer (const std::string &name) const =0 |
| | Get a const reference to a pattern lexer. More...
|
| |
virtual const
PatternMatcherInterface * | getPatternMatcher (const std::string &name) const =0 |
| | Get a const reference to a pattern matcher. More...
|
| |
virtual const
PatternTermFeederInterface * | getPatternTermFeeder () const =0 |
| | Get the default pattern term feeder interface for post processing pattern matching on analyzer output. More...
|
| |
| virtual PosTaggerDataInterface * | createPosTaggerData (TokenizerFunctionInstanceInterface *tokenizer) const =0 |
| | Create a data structure to feed with POS tagging info. More...
|
| |
| virtual const PosTaggerInterface * | getPosTagger () const =0 |
| | Get the default POS tagger interface to do POS tagging of documents. More...
|
| |
virtual
TokenMarkupInstanceInterface * | createTokenMarkupInstance () const =0 |
| | Create an interface for markup of content. More...
|
| |
| virtual bool | detectDocumentClass (analyzer::DocumentClass &dclass, const char *contentBegin, std::size_t contentBeginSize, bool isComplete) const =0 |
| | Detect the document class from a document start chunk and set the content description attributes. More...
|
| |
| virtual void | defineDocumentClassDetector (DocumentClassDetectorInterface *detector)=0 |
| | Define a content detector. More...
|
| |
| virtual void | defineSegmenter (const std::string &name, SegmenterInterface *segmenter)=0 |
| | Define a document segmenter by name. More...
|
| |
| virtual void | defineSegmenterOptions (const std::string &scheme, const analyzer::SegmenterOptions &options)=0 |
| | Define segmenter options by document scheme identifier. More...
|
| |
| virtual void | defineTokenizer (const std::string &name, TokenizerFunctionInterface *tokenizer)=0 |
| | Define a tokenizer by name. More...
|
| |
| virtual void | defineNormalizer (const std::string &name, NormalizerFunctionInterface *normalizer)=0 |
| | Define a normalizer by name. More...
|
| |
| virtual void | defineAggregator (const std::string &name, AggregatorFunctionInterface *aggregator)=0 |
| | Define an aggregator function by name. More...
|
| |
| virtual void | definePatternLexer (const std::string &name, PatternLexerInterface *lexer)=0 |
| | Define a pattern matching lexer by name. More...
|
| |
| virtual void | definePatternMatcher (const std::string &name, PatternMatcherInterface *matcher)=0 |
| | Define a pattern matcher by name. More...
|
| |
| virtual std::vector< std::string > | getFunctionList (const FunctionType &type) const =0 |
| | Get a list of all functions of a specific type available. More...
|
| |
Interface for the object providing tokenizers and normalizers used for creating terms from segments of text and functions for collecting overall document statistics.