Interface for the object providing tokenizers and normalizers used for creating terms from segments of text and functions for collecting overall document statistics.
More...
|
virtual | ~TextProcessorInterface () |
| Destructor. More...
|
|
virtual std::string | getResourceFilePath (const std::string &filename) const =0 |
| Get the absolute path of a resource file. More...
|
|
virtual const SegmenterInterface * | getSegmenterByName (const std::string &segmenterName) const =0 |
| Get a document segmenter object reference. More...
|
|
virtual const SegmenterInterface * | getSegmenterByMimeType (const std::string &mimetype) const =0 |
| Get a document segmenter object reference that is able to process the specified MIME type. More...
|
|
virtual analyzer::SegmenterOptions | getSegmenterOptions (const std::string &scheme) const =0 |
| Get the options for a document segmenter for a specific document type. More...
|
|
virtual const
TokenizerFunctionInterface * | getTokenizer (const std::string &name) const =0 |
| Get a const reference to a tokenizer object that implements the splitting of text segments into tokens. More...
|
|
virtual const
NormalizerFunctionInterface * | getNormalizer (const std::string &name) const =0 |
| Get a const reference to a normalizer object that implements the transformation of a token into a term string. More...
|
|
virtual const
AggregatorFunctionInterface * | getAggregator (const std::string &name) const =0 |
| Get a const reference to a statistics collector function object that implements the counting of document parts. More...
|
|
virtual const
PatternLexerInterface * | getPatternLexer (const std::string &name) const =0 |
| Get a const reference to a pattern lexer. More...
|
|
virtual const
PatternMatcherInterface * | getPatternMatcher (const std::string &name) const =0 |
| Get a const reference to a pattern matcher. More...
|
|
virtual const
PatternTermFeederInterface * | getPatternTermFeeder () const =0 |
| Get the default pattern term feeder interface for post processing pattern matching on analyzer output. More...
|
|
virtual PosTaggerDataInterface * | createPosTaggerData (TokenizerFunctionInstanceInterface *tokenizer) const =0 |
| Create a data structure to feed with POS tagging info. More...
|
|
virtual const PosTaggerInterface * | getPosTagger () const =0 |
| Get the default POS tagger interface to do POS tagging of documents. More...
|
|
virtual
TokenMarkupInstanceInterface * | createTokenMarkupInstance () const =0 |
| Create an interface for markup of content. More...
|
|
virtual bool | detectDocumentClass (analyzer::DocumentClass &dclass, const char *contentBegin, std::size_t contentBeginSize, bool isComplete) const =0 |
| Detect the document class from a document start chunk and set the content description attributes. More...
|
|
virtual void | defineDocumentClassDetector (DocumentClassDetectorInterface *detector)=0 |
| Define a content detector. More...
|
|
virtual void | defineSegmenter (const std::string &name, SegmenterInterface *segmenter)=0 |
| Define a document segmenter by name. More...
|
|
virtual void | defineSegmenterOptions (const std::string &scheme, const analyzer::SegmenterOptions &options)=0 |
| Define segmenter options by document scheme identifier. More...
|
|
virtual void | defineTokenizer (const std::string &name, TokenizerFunctionInterface *tokenizer)=0 |
| Define a tokenizer by name. More...
|
|
virtual void | defineNormalizer (const std::string &name, NormalizerFunctionInterface *normalizer)=0 |
| Define a normalizer by name. More...
|
|
virtual void | defineAggregator (const std::string &name, AggregatorFunctionInterface *aggregator)=0 |
| Define an aggregator function by name. More...
|
|
virtual void | definePatternLexer (const std::string &name, PatternLexerInterface *lexer)=0 |
| Define a pattern matching lexer by name. More...
|
|
virtual void | definePatternMatcher (const std::string &name, PatternMatcherInterface *matcher)=0 |
| Define a pattern matcher by name. More...
|
|
virtual std::vector< std::string > | getFunctionList (const FunctionType &type) const =0 |
| Get a list of all functions of a specific type available. More...
|
|
Interface for the object providing tokenizers and normalizers used for creating terms from segments of text and functions for collecting overall document statistics.