Defines a program for analyzing a document, splitting it into normalized terms that can be fed to the strus IR engine. More...

#include <documentAnalyzerInstanceInterface.hpp>

Public Member Functions
virtual	~DocumentAnalyzerInstanceInterface ()
	Destructor. More...

virtual void	addSearchIndexFeature (const std::string &type, const std::string &selectexpr, TokenizerFunctionInstanceInterface *tokenizer, const std::vector< NormalizerFunctionInstanceInterface * > &normalizers, int priority, const analyzer::FeatureOptions &options)=0
	Declare a feature to be put into the search index. More...

virtual void	addForwardIndexFeature (const std::string &type, const std::string &selectexpr, TokenizerFunctionInstanceInterface *tokenizer, const std::vector< NormalizerFunctionInstanceInterface * > &normalizers, int priority, const analyzer::FeatureOptions &options)=0
	Declare a feature to be put into the forward index used for summarization extraction. More...

virtual void	defineMetaData (const std::string &metaname, const std::string &selectexpr, TokenizerFunctionInstanceInterface *tokenizer, const std::vector< NormalizerFunctionInstanceInterface * > &normalizers)=0
	Declare a feature to be put into the meta data table used for restrictions, weighting and summarization. More...

virtual void	defineAggregatedMetaData (const std::string &metaname, AggregatorFunctionInstanceInterface *statfunc)=0
	Declare some collected statistics of the document to be put into the meta data table used for restrictions, weighting and summarization. More...

virtual void	defineAttribute (const std::string &attribname, const std::string &selectexpr, TokenizerFunctionInstanceInterface *tokenizer, const std::vector< NormalizerFunctionInstanceInterface * > &normalizers)=0
	Declare a feature to be defined as document attribute used for summarization (document title, document id, etc.) More...

virtual void	defineSubDocument (const std::string &subDocumentTypeName, const std::string &selectexpr)=0
	Declare a sub document for the handling of multi part documents in an analyzed content. More...

virtual void	defineSubContent (const std::string &selectexpr, const analyzer::DocumentClass &documentClass)=0
	Declare a sub content of the document that has to be processed with a different segmenter. More...

virtual void	addPatternLexem (const std::string &termtype, const std::string &selectexpr, TokenizerFunctionInstanceInterface *tokenizer, const std::vector< NormalizerFunctionInstanceInterface * > &normalizers, int priority)=0
	Declare an element to be used as lexem by post processing pattern matching but not put into the result of document analysis. More...

virtual void	defineTokenPatternMatcher (const std::string &patternName, PatternMatcherInstanceInterface *matcher, PatternTermFeederInstanceInterface *feeder)=0
	Declare a pattern matcher on the document terms of pattern lexems after normalization. More...

virtual void	defineContentPatternMatcher (const std::string &patternName, PatternMatcherInstanceInterface *matcher, PatternLexerInstanceInterface *lexer, const std::vector< std::string > &selectexpr)=0
	Declare a pattern matcher on the document features after other document analysis. More...

virtual void	addSearchIndexFeatureFromPatternMatch (const std::string &type, const std::string &patternName, const std::vector< NormalizerFunctionInstanceInterface * > &normalizers, int priority, const analyzer::FeatureOptions &options)=0
	Declare a feature to be put into the search index derived from a pattern matcher result item. More...

virtual void	addForwardIndexFeatureFromPatternMatch (const std::string &type, const std::string &patternName, const std::vector< NormalizerFunctionInstanceInterface * > &normalizers, int priority, const analyzer::FeatureOptions &options)=0
	Declare a feature to be put into the forward index derived from a pattern matcher result item. More...

virtual void	defineMetaDataFromPatternMatch (const std::string &metaname, const std::string &patternName, const std::vector< NormalizerFunctionInstanceInterface * > &normalizers)=0
	Declare a feature to be put into the meta data table for restrictions, weighting and summarization, derived from a pattern matcher result item. More...

virtual void	defineAttributeFromPatternMatch (const std::string &attribname, const std::string &patternName, const std::vector< NormalizerFunctionInstanceInterface * > &normalizers)=0
	Declare a feature to be defined as document attribute used for summarization, derived from a pattern matcher result item. More...

virtual analyzer::Document	analyze (const std::string &content, const analyzer::DocumentClass &dclass) const =0
	Segment and tokenize a document, assign types to tokens and metadata and normalize their values. More...

virtual DocumentAnalyzerContextInterface *	createContext (const analyzer::DocumentClass &dclass) const =0
	Create the context used for analyzing multipart or very big documents. More...

virtual analyzer::DocumentAnalyzerView	view () const =0
	Return a structure with all definitions for introspection. More...

Detailed Description

Defines a program for analyzing a document, splitting it into normalized terms that can be fed to the strus IR engine.

Constructor & Destructor Documentation

virtual strus::DocumentAnalyzerInstanceInterface::~DocumentAnalyzerInstanceInterface ( )

inline virtual

Destructor.

Member Function Documentation

virtual void strus::DocumentAnalyzerInstanceInterface::addForwardIndexFeature	(	const std::string &	type,
		const std::string &	selectexpr,
		TokenizerFunctionInstanceInterface *	tokenizer,
		const std::vector< NormalizerFunctionInstanceInterface * > &	normalizers,
		int	priority,
		const analyzer::FeatureOptions &	options
	)

pure virtual

Declare a feature to be put into the forward index used for summarization extraction.

Parameters

[in]	type	type name of the feature
[in]	selectexpr	an expression that describes what elements are taken from a document for this feature (tag selection in abbreviated syntax of XPath)
[in]	tokenizer	tokenizer (ownership passed to this) to use for this feature
[in]	normalizers	list of normalizers (ownership of elements passed to this) to use for this feature
[in]	priority	element priority; analyzer elements with lower priority are ousted if they are completely covered by elements with higher priority
[in]	options	options that steer the document analysis result (e.g. influence the assignment of document position of terms produced)

virtual void strus::DocumentAnalyzerInstanceInterface::addForwardIndexFeatureFromPatternMatch	(	const std::string &	type,
		const std::string &	patternName,
		const std::vector< NormalizerFunctionInstanceInterface * > &	normalizers,
		int	priority,
		const analyzer::FeatureOptions &	options
	)

pure virtual

Declare a feature to be put into the forward index derived from a pattern matcher result item.

Parameters

[in]	type	type name of the feature
[in]	patternName	type name of the pattern match result or result item
[in]	normalizers	list of normalizers (element ownership passed to this) to use for this feature
[in]	priority	element priority; analyzer elements with lower priority are ousted if they are completely covered by elements with higher priority
[in]	options	(only for pre processing patterns) options that steer the document analysis result, e.g. influence the assignment of document position of terms produced

virtual void strus::DocumentAnalyzerInstanceInterface::addPatternLexem	(	const std::string &	termtype,
		const std::string &	selectexpr,
		TokenizerFunctionInstanceInterface *	tokenizer,
		const std::vector< NormalizerFunctionInstanceInterface * > &	normalizers,
		int	priority
	)

pure virtual

Declare an element to be used as lexem by post processing pattern matching but not put into the result of document analysis.

Parameters

[in]	termtype	term type name of the lexem to be fed to the pattern matching
[in]	selectexpr	an expression that describes what elements are taken from a document for this feature (tag selection in abbreviated syntax of XPath)
[in]	tokenizer	tokenizer (ownership passed to this) to use for this feature
[in]	normalizers	list of normalizers (element ownership passed to this) to use for this feature
[in]	priority	element priority; analyzer elements with lower priority are ousted if they are completely covered by elements with higher priority

virtual void strus::DocumentAnalyzerInstanceInterface::addSearchIndexFeature	(	const std::string &	type,
		const std::string &	selectexpr,
		TokenizerFunctionInstanceInterface *	tokenizer,
		const std::vector< NormalizerFunctionInstanceInterface * > &	normalizers,
		int	priority,
		const analyzer::FeatureOptions &	options
	)

pure virtual

Declare a feature to be put into the search index.

Parameters

[in]	type	type name of the feature
[in]	selectexpr	an expression that describes what elements are taken from a document for this feature (tag selection in abbreviated syntax of XPath)
[in]	tokenizer	tokenizer (ownership passed to this) to use for this feature
[in]	normalizers	list of normalizers (element ownership passed to this) to use for this feature
[in]	priority	element priority; analyzer elements with lower priority are ousted if they are completely covered by elements with higher priority
[in]	options	options that steer the document analysis result (e.g. influence the assignment of document position of terms produced)

virtual void strus::DocumentAnalyzerInstanceInterface::addSearchIndexFeatureFromPatternMatch	(	const std::string &	type,
		const std::string &	patternName,
		const std::vector< NormalizerFunctionInstanceInterface * > &	normalizers,
		int	priority,
		const analyzer::FeatureOptions &	options
	)

pure virtual

Declare a feature to be put into the search index derived from a pattern matcher result item.

Parameters

[in]	type	type name of the feature
[in]	patternName	type name of the pattern match result or result item
[in]	normalizers	list of normalizers (element ownership passed to this) to use for this feature
[in]	priority	element priority; analyzer elements with lower priority are ousted if they are completely covered by elements with higher priority
[in]	options	(only for pre processing patterns) options that steer the document analysis result, e.g. influence the assignment of document position of terms produced

virtual analyzer::Document strus::DocumentAnalyzerInstanceInterface::analyze	(	const std::string &	content,
		const analyzer::DocumentClass &	dclass
	)		const

pure virtual

Segment and tokenize a document, assign types to tokens and metadata and normalize their values.

Parameters

[in]	content	document content string to analyze
[in]	dclass	description of the content type and encoding to process

Returns: the analyzed document

Remarks: Do not use this function for a multipart document (defined with 'defineSubDocument(const std::string&,const std::string&)'), because only one sub document gets analyzed. Use the context interface created with 'createContext(const analyzer::DocumentClass&)const' instead.

virtual DocumentAnalyzerContextInterface* strus::DocumentAnalyzerInstanceInterface::createContext ( const analyzer::DocumentClass & dclass ) const

pure virtual

Create the context used for analyzing multipart or very big documents.

Parameters

[in] dclass description of the content type and encoding to process

Returns: the document analyzer context (with ownership)

virtual void strus::DocumentAnalyzerInstanceInterface::defineAggregatedMetaData	(	const std::string &	metaname,
		AggregatorFunctionInstanceInterface *	statfunc
	)

pure virtual

Declare some collected statistics of the document to be put into the meta data table used for restrictions, weighting and summarization.

Parameters

[in]	metaname	name of the column in the meta data table this feature is written to
[in]	statfunc	function (ownership passed to this) that describes how the value to be inserted is calculated from a document

Remarks: The field in the meta data table must exist before this function is called

virtual void strus::DocumentAnalyzerInstanceInterface::defineAttribute	(	const std::string &	attribname,
		const std::string &	selectexpr,
		TokenizerFunctionInstanceInterface *	tokenizer,
		const std::vector< NormalizerFunctionInstanceInterface * > &	normalizers
	)

pure virtual

Declare a feature to be defined as document attribute used for summarization (document title, document id, etc.)

Parameters

[in]	attribname	name of the document attribute this feature is written as.
[in]	selectexpr	an expression that describes what elements are taken from a document for this feature (tag selection in abbreviated syntax of XPath)
[in]	tokenizer	tokenizer (ownership passed to this) to use for this feature
[in]	normalizers	list of normalizers (ownership of elements passed to this) to use for this feature

Remarks: Attributes must be defined uniquely per document

virtual void strus::DocumentAnalyzerInstanceInterface::defineAttributeFromPatternMatch	(	const std::string &	attribname,
		const std::string &	patternName,
		const std::vector< NormalizerFunctionInstanceInterface * > &	normalizers
	)

pure virtual

Declare a feature to be defined as document attribute used for summarization, derived from a pattern matcher result item.

Parameters

[in]	attribname	name of the document attribute assigned
[in]	patternName	type name of the pattern match result or result item
[in]	normalizers	list of normalizers (element ownership passed to this) to use for this feature

Remarks: Attributes must be defined uniquely per document

virtual void strus::DocumentAnalyzerInstanceInterface::defineContentPatternMatcher	(	const std::string &	patternName,
		PatternMatcherInstanceInterface *	matcher,
		PatternLexerInstanceInterface *	lexer,
		const std::vector< std::string > &	selectexpr
	)

pure virtual

Declare a pattern matcher on the document features after other document analysis.

Parameters

[in]	patternName	name of the type to assign to the pattern matching results
[in]	matcher	pattern matcher compiled (ownership passed to this)
[in]	lexer	lexer that tokenizes a document segment as input of pattern matching (ownership passed to this)
[in]	selectexpr	list of selection expressions as input of the pattern matching

virtual void strus::DocumentAnalyzerInstanceInterface::defineMetaData	(	const std::string &	metaname,
		const std::string &	selectexpr,
		TokenizerFunctionInstanceInterface *	tokenizer,
		const std::vector< NormalizerFunctionInstanceInterface * > &	normalizers
	)

pure virtual

Declare a feature to be put into the meta data table used for restrictions, weighting and summarization.

Parameters

[in]	metaname	name of the column in the meta data table this feature is written to
[in]	selectexpr	an expression that describes what elements are taken from a document for this feature (tag selection in abbreviated syntax of XPath)
[in]	tokenizer	tokenizer (ownership passed to this) to use for this feature
[in]	normalizers	list of normalizers (ownership of elements passed to this) to use for this feature

Remarks: The field in the meta data table must exist before this function is called

virtual void strus::DocumentAnalyzerInstanceInterface::defineMetaDataFromPatternMatch	(	const std::string &	metaname,
		const std::string &	patternName,
		const std::vector< NormalizerFunctionInstanceInterface * > &	normalizers
	)

pure virtual

Declare a feature to be put into the meta data table for restrictions, weighting and summarization, derived from a pattern matcher result item.

Parameters

[in]	metaname	name of the column in the meta data table this feature is written to
[in]	patternName	type name of the pattern match result or result item
[in]	normalizers	list of normalizers (element ownership passed to this) to use for this feature

Remarks: The field in the meta data table must exist before this function is called

virtual void strus::DocumentAnalyzerInstanceInterface::defineSubContent	(	const std::string &	selectexpr,
		const analyzer::DocumentClass &	documentClass
	)

pure virtual

Declare a sub content of the document that has to be processed with a different segmenter.

Parameters

[in]	selectexpr	an expression that addresses the sub content declared
[in]	documentClass	defines the content type of the sub content

Remarks: Sub contents have to be defined before (!) any item with a selection expression referring to it.

virtual void strus::DocumentAnalyzerInstanceInterface::defineSubDocument	(	const std::string &	subDocumentTypeName,
		const std::string &	selectexpr
	)

pure virtual

Declare a sub document for the handling of multi part documents in an analyzed content.

Parameters

[in]	selectexpr	an expression that defines the content of the sub document declared
[in]	subDocumentTypeName	type name assigned to this sub document

Remarks: Sub documents are defined as the sections selected by the expression plus some data selected not belonging to any sub document.

virtual void strus::DocumentAnalyzerInstanceInterface::defineTokenPatternMatcher	(	const std::string &	patternName,
		PatternMatcherInstanceInterface *	matcher,
		PatternTermFeederInstanceInterface *	feeder
	)

pure virtual

Declare a pattern matcher on the document terms of pattern lexems after normalization.

Parameters

[in]	patternName	name of the type to assign to the pattern matching results
[in]	matcher	pattern matcher compiled (ownership passed to this)
[in]	feeder	feeder that maps document analysis term to pattern lexems as input of the matcher (ownership passed to this)

virtual analyzer::DocumentAnalyzerView strus::DocumentAnalyzerInstanceInterface::view ( ) const

pure virtual

Return a structure with all definitions for introspection.

Returns: the structure with all definitions for introspection

The documentation for this class was generated from the following file:

/home/patrick/Projects/github/strusAnalyzer/include/strus/documentAnalyzerInstanceInterface.hpp

Public Member Functions

Detailed Description

Constructor & Destructor Documentation

Member Function Documentation