strusAnalyzer  0.17
textProcessorInterface.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2014 Patrick P. Frey
3  *
4  * This Source Code Form is subject to the terms of the Mozilla Public
5  * License, v. 2.0. If a copy of the MPL was not distributed with this
6  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7  */
10 #ifndef _STRUS_ANALYZER_TEXT_PROCESSOR_INTERFACE_HPP_INCLUDED
11 #define _STRUS_ANALYZER_TEXT_PROCESSOR_INTERFACE_HPP_INCLUDED
14 #include <string>
15 #include <vector>
16 
18 namespace strus
19 {
21 class DocumentClassDetectorInterface;
23 class SegmenterInterface;
25 class NormalizerFunctionInterface;
27 class TokenizerFunctionInterface;
29 class TokenizerFunctionInstanceInterface;
31 class AggregatorFunctionInterface;
33 class PatternLexerInterface;
35 class PatternMatcherInterface;
37 class PatternMatcherProgramInterface;
39 class PatternTermFeederInterface;
41 class PosTaggerDataInterface;
43 class PosTaggerInterface;
45 class TokenMarkupInstanceInterface;
46 
50 {
51 public:
54 
57  virtual std::string getResourceFilePath( const std::string& filename) const=0;
58 
62  virtual const SegmenterInterface* getSegmenterByName( const std::string& segmenterName) const=0;
63 
67  virtual const SegmenterInterface* getSegmenterByMimeType( const std::string& mimetype) const=0;
68 
71  virtual analyzer::SegmenterOptions getSegmenterOptions( const std::string& scheme) const=0;
72 
75  virtual const TokenizerFunctionInterface* getTokenizer( const std::string& name) const=0;
76 
79  virtual const NormalizerFunctionInterface* getNormalizer( const std::string& name) const=0;
80 
83  virtual const AggregatorFunctionInterface* getAggregator( const std::string& name) const=0;
84 
87  virtual const PatternLexerInterface* getPatternLexer( const std::string& name) const=0;
88 
91  virtual const PatternMatcherInterface* getPatternMatcher( const std::string& name) const=0;
92 
95  virtual const PatternTermFeederInterface* getPatternTermFeeder() const=0;
96 
102 
105  virtual const PosTaggerInterface* getPosTagger() const=0;
106 
110 
117  virtual bool detectDocumentClass( analyzer::DocumentClass& dclass, const char* contentBegin, std::size_t contentBeginSize, bool isComplete) const=0;
118 
122 
126  virtual void defineSegmenter( const std::string& name, SegmenterInterface* segmenter)=0;
127 
131  virtual void defineSegmenterOptions( const std::string& scheme, const analyzer::SegmenterOptions& options)=0;
132 
136  virtual void defineTokenizer( const std::string& name, TokenizerFunctionInterface* tokenizer)=0;
137 
141  virtual void defineNormalizer( const std::string& name, NormalizerFunctionInterface* normalizer)=0;
142 
146  virtual void defineAggregator( const std::string& name, AggregatorFunctionInterface* aggregator)=0;
147 
151  virtual void definePatternLexer( const std::string& name, PatternLexerInterface* lexer)=0;
152 
156  virtual void definePatternMatcher( const std::string& name, PatternMatcherInterface* matcher)=0;
157 
160  {
167  };
168  static const char* functionTypeName( FunctionType t) // Returns the display name string for function type t.
169  {
170  const char* ar[] = {"Segmenter","Tokenizer","Normalizer","Aggregator","PatternLexer","PatternMatcher",0}; // Name table indexed by t, closed by a null entry; order presumably mirrors the FunctionType enumerators (enum body not visible here - confirm).
171  return ar[t]; // t is used as the index without a range check; the returned pointer refers to one of the string literals above (or the null entry).
172  }
173 
177  virtual std::vector<std::string> getFunctionList( const FunctionType& type) const=0;
178 };
179 
180 }//namespace
181 #endif
182 
virtual void defineTokenizer(const std::string &name, TokenizerFunctionInterface *tokenizer)=0
Define a tokenizer by name.
Defines a description of the properties of an original document processed by the segmenter.
Definition: documentClass.hpp:21
virtual PosTaggerDataInterface * createPosTaggerData(TokenizerFunctionInstanceInterface *tokenizer) const =0
Create a data structure to feed with POS tagging info.
Addresses a document segmenter.
Definition: textProcessorInterface.hpp:161
virtual void defineAggregator(const std::string &name, AggregatorFunctionInterface *aggregator)=0
Define an aggregator function by name.
virtual void defineNormalizer(const std::string &name, NormalizerFunctionInterface *normalizer)=0
Define a normalizer by name.
virtual void definePatternLexer(const std::string &name, PatternLexerInterface *lexer)=0
Define a pattern matching lexer by name.
virtual const AggregatorFunctionInterface * getAggregator(const std::string &name) const =0
Get a const reference to a statistics collector function object that implements the collection of som...
virtual bool detectDocumentClass(analyzer::DocumentClass &dclass, const char *contentBegin, std::size_t contentBeginSize, bool isComplete) const =0
Detect the document class from a document start chunk and set the content description attributes...
Interface for the aggregator function constructor.
Definition: aggregatorFunctionInterface.hpp:24
Interface for creating an automaton for detecting patterns of tokens in a document stream...
Definition: patternMatcherInterface.hpp:21
Options to steer the segmenter behaviour or the structure of input if not self defined in the documen...
Definition: segmenterOptions.hpp:25
Interface for a tokenizer function.
Definition: tokenizerFunctionInterface.hpp:25
virtual const SegmenterInterface * getSegmenterByName(const std::string &segmenterName) const =0
Get a document segmenter object reference.
Structure describing the MIME type plus some attributes that could be relevant for analysis of a docu...
virtual void defineSegmenterOptions(const std::string &scheme, const analyzer::SegmenterOptions &options)=0
Define segmenter options by document scheme identifier.
virtual const PatternTermFeederInterface * getPatternTermFeeder() const =0
Get the default pattern term feeder interface for post processing pattern matching on analyzer output...
virtual const TokenizerFunctionInterface * getTokenizer(const std::string &name) const =0
Get a const reference to a tokenizer object that implements the splitting of a text segment into tok...
Interface for instantiating the data structure of an automaton for detecting lexemes used as basic ent...
Definition: patternTermFeederInterface.hpp:21
Interface for the data built by a POS tagger.
Definition: posTaggerDataInterface.hpp:24
Interface for tokenization.
Definition: tokenizerFunctionInstanceInterface.hpp:21
Defines a detector that returns a content description for a document content it recognizes.
Definition: documentClassDetectorInterface.hpp:21
Interface for instantiating the data structure of an automaton for detecting lexemes used as basic ent...
Definition: patternLexerInterface.hpp:21
virtual void defineSegmenter(const std::string &name, SegmenterInterface *segmenter)=0
Define a document segmenter by name.
virtual analyzer::SegmenterOptions getSegmenterOptions(const std::string &scheme) const =0
Get the options for a document segmenter for a specific document type.
virtual const PatternMatcherInterface * getPatternMatcher(const std::string &name) const =0
Get a const reference to a pattern matcher.
virtual std::string getResourceFilePath(const std::string &filename) const =0
Get the absolute path of a resource file.
Interface for building the automaton for detecting patterns of tokens in a document stream...
Definition: tokenMarkupInstanceInterface.hpp:24
Addresses a normalizer.
Definition: textProcessorInterface.hpp:163
virtual const NormalizerFunctionInterface * getNormalizer(const std::string &name) const =0
Get a const reference to a normalizer object that implements the transformation of a token into a ter...
Addresses an aggregator.
Definition: textProcessorInterface.hpp:164
virtual void definePatternMatcher(const std::string &name, PatternMatcherInterface *matcher)=0
Define a pattern matcher by name.
virtual std::vector< std::string > getFunctionList(const FunctionType &type) const =0
Get a list of all functions of a specific type available.
static const char * functionTypeName(FunctionType t)
Definition: textProcessorInterface.hpp:168
Interface for the normalizer constructor.
Definition: normalizerFunctionInterface.hpp:26
virtual void defineDocumentClassDetector(DocumentClassDetectorInterface *detector)=0
Define a content detector.
virtual const SegmenterInterface * getSegmenterByMimeType(const std::string &mimetype) const =0
Get a document segmenter object reference that is able to process the specified MIME type...
virtual const PosTaggerInterface * getPosTagger() const =0
Get the default POS tagger interface to do POS tagging of documents.
Interface for the construction of a POS tagger instance for a specified segmenter.
Definition: posTaggerInterface.hpp:25
Addresses a pattern matcher.
Definition: textProcessorInterface.hpp:166
virtual TokenMarkupInstanceInterface * createTokenMarkupInstance() const =0
Create an interface for markup of content.
virtual ~TextProcessorInterface()
Destructor.
Definition: textProcessorInterface.hpp:53
Options to steer the segmenter behaviour or the structure of input if not self defined in the documen...
Addresses a pattern lexer.
Definition: textProcessorInterface.hpp:165
Addresses a tokenizer.
Definition: textProcessorInterface.hpp:162
FunctionType
Function type for fetching descriptions of available functions.
Definition: textProcessorInterface.hpp:159
Defines an interface for creating instances of programs for document segmentation.
Definition: segmenterInterface.hpp:29
Interface for the object providing tokenizers and normalizers used for creating terms from segments o...
Definition: textProcessorInterface.hpp:49
virtual const PatternLexerInterface * getPatternLexer(const std::string &name) const =0
Get a const reference to a pattern lexer.