A file for globally defined constants and classes. More...

#include "../CommonLib/utilit.h"
#include "../CommonLib/ddcLog.h"
#include "list"
#include "limits.h"
#include "../GraphanLib/GraphmatFile.h"
#include "../LemmatizerLib/Lemmatizers.h"
#include "../AgramtabLib/EngGramTab.h"
#include "../AgramtabLib/RusGramTab.h"
#include "../AgramtabLib/GerGramTab.h"
#include "../CommonLib/DDC_common.h"
#include "../CommonLib/ddcObject.h"
#include "../tinyxml/tinyxml.h"
#include "../ConcordLib/CCurl.h"

Include dependency graph for ConcCommon.h:

This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Classes
class	CShortOccurCache

struct	CShortOccurCache::CDataReference
	the structure holds a pointer to a vector of occurrences and its size More...

class	CFreeBiblIndexInterface

struct	CHitSortKey

struct	CDDCFilterWithBounds

struct	CHit

struct	CHitCompareByBreak
	compare hits by break-number (for query evaluation, e.g. CQueryBinaryOperationNode::hits_and_positions_intersection()) More...

Macros
#define	DDC_SORTKEY_MAXLEN 256

Typedefs
typedef DWORD	CTokenNo
	integer type CTokenNo is used to refer to the index of a token in the corpus More...

typedef DWORD	CFileNo
	integer type CFileNo is used to refer to a single document (file) in the corpus More...

typedef map< string, CShortOccurCache >	CShortOccurCacheMap
	a map type from an index string to its cached occurrences More...

typedef vector< CTokenNo >	COccurrBuffer
	a type for holding occurrences during reading from the disk More...

Enumerations
enum	HitSortEnum { NoSort = 0, LessByDate = 1, GreaterByDate = 2, LessBySize = 3, GreaterBySize = 4, LessByFreeBiblField = 5, GreaterByFreeBiblField = 6, LessByRank = 7, GreaterByRank = 8, LessByMiddleContext = 9, GreaterByMiddleContext = 10, LessByLeftContext = 11, GreaterByLeftContext = 12, LessByRightContext = 13, GreaterByRightContext = 14, RandomSort = 15, LessByCountKey = 16, GreaterByCountKey = 17, LessByCountValue = 18, GreaterByCountValue = 19, LessByPruneKey = 20, GreaterByPruneKey = 21, HitSortsCount = 22 }

enum	HitSortOrderEnum { hsoDescending = -1, hsoNone = 0, hsoAscending = 1, hsoAscendingCountKeys = 2, hsoDescendingCountKeys = 3, hsoAscendingCountValues = 4, hsoDescendingCountValues = 5 }

enum	DDCFormatTypeEnum { DDC_ResultHTML, DDC_ResultText, DDC_ResultTable, DDC_ResultDocIds, DDC_ResultJson }
	DDCFormatTypeEnum defines the format of output hits: More...

Functions
void	ddcInitGlobal (void)
	global initialization More...

bool	IsPruneFilterType (HitSortEnum e)

bool	IsNullSort (HitSortOrderEnum e)

bool	IsCountSort (HitSortOrderEnum e)

bool	IsCountKeySort (HitSortOrderEnum e)

bool	IsCountValueSort (HitSortOrderEnum e)

bool	InitConcordDicts ()
	initializes morphology dictionaries More...

void	FreeConcordDicts ()
	deletes morphology dictionaries More...

const CLemmatizer *	GetLemmatizerByLanguage (MorphLanguageEnum Langua)
	return a morphology dictionary by a language identifier More...

const CAgramtab *	GetGramtabByLanguage (MorphLanguageEnum Langua)
	return a grammatical table by a language identifier More...

string	GetIndexItemSetByVectorString (const vector< string > &TokenProperties, bool bRegexp)
	return a string representation of a set of token properties (in the format which is used in the index) More...

Variables
const char	globalFieldDelimeter = '\t'
	a globally defined delimiter, which is used to delimit fields in one record (the first field is always a token) More...

const string	PredefinedTableLineTag = "l"
	a globally defined xml-tag, which is used to separate records if CConcIndexator::m_IndexType is Free_Index More...

const string	ChunkIndexName = "chunk"
	a globally defined index name for chunks More...

const string	PredefinedFileBreakName = "file"
	a globally defined break collection name for corpus files More...

const string	PredefinedTextAreaBreakName = "textarea"
	a globally defined break collection name for text areas More...

const size_t	MaxShortOccurCacheSize = 1000000
	MaxShortOccurCacheSize is the upper bound of CShortOccurCache::m_Data.size(). It is introduced to restrict memory usage. More...

const size_t	MaxBiblStringLen = 20000

static const char *	HitSortEnumNames [HitSortsCount]

static const char *	HitSortEnumStrings [HitSortsCount]

const string	MorphAnnotationsDelim = "#"
	a delimiter between morphological annotations More...

const string	MorphAnnotationsDelimRegExp = "[^#]*"
	a regular expression, which passes everything within one morphological annotation More...

Detailed Description

A file for globally defined constants and classes.

Macro Definition Documentation

◆ DDC_SORTKEY_MAXLEN

#define DDC_SORTKEY_MAXLEN 256

maximum length of string sort-keys passed from subcorpus servers to distributed server, including terminating NUL (see CConcHolder::GetHitIds() and CDDCServerListenHost::GetFirstHitsFromCorpora())

Referenced by CConcSession::GetCountIds(), CDDCBranchServer::GetFirstHitsFromCorpora(), and CConcSession::GetHitIds().

Typedef Documentation

◆ CTokenNo

typedef DWORD CTokenNo

integer type CTokenNo is used to refer to the index of a token in the corpus

◆ CFileNo

typedef DWORD CFileNo

integer type CFileNo is used to refer to a single document (file) in the corpus

◆ CShortOccurCacheMap

typedef map<string,CShortOccurCache> CShortOccurCacheMap

a map type from an index string to its cached occurrences

◆ COccurrBuffer

typedef vector<CTokenNo> COccurrBuffer

a type for holding occurrences during reading from the disk

Enumeration Type Documentation

◆ HitSortEnum

enum HitSortEnum

HitSortEnum defines the types of all possible orders which can be applied to an output hit set.

Enumerator
NoSort	no sort operators, only filtering (used by #has_field[])
LessByDate	sort by the issue date (ascending)
GreaterByDate	sort by the issue date (descending)
LessBySize	sort by the size of the hit in tokens (ascending)
GreaterBySize	sort by the size of the hit in tokens (descending)
LessByFreeBiblField	sort by a free bibliographical field (ascending)
GreaterByFreeBiblField	sort by a free bibliographical field (descending)
LessByRank	sort by document rank (ascending)
GreaterByRank	sort by document rank (descending)
LessByMiddleContext	sort by central context (ascending)
GreaterByMiddleContext	sort by match context (descending)
LessByLeftContext	sort by left context (ascending)
GreaterByLeftContext	sort by left context (descending)
LessByRightContext	sort by right context (ascending)
GreaterByRightContext	sort by right context (descending)
RandomSort	sort by random key
LessByCountKey	sort by count()-key (ascending)
GreaterByCountKey	sort by count()-key (descending)
LessByCountValue	sort by count()-value (ascending)
GreaterByCountValue	sort by count()-value (descending)
LessByPruneKey	sort by #prune[]-key (ascending)
GreaterByPruneKey	sort by #prune[]-key (descending)
HitSortsCount	end-of-enum sentinel

◆ HitSortOrderEnum

enum HitSortOrderEnum

symbolic hit sort order constants

Enumerator
hsoDescending
hsoNone
hsoAscending
hsoAscendingCountKeys
hsoDescendingCountKeys
hsoAscendingCountValues
hsoDescendingCountValues

◆ DDCFormatTypeEnum

enum DDCFormatTypeEnum

DDCFormatTypeEnum defines the format of output hits:

DDC_ResultHTML - output hits should be written in HTML-format; the tags "<STRONG><FONT COLOR=red>" are used for highlighting.
DDC_ResultText - output hits should be written in plain-text format; the string "&&" marks the first highlighted position, and "_" marks each following highlighted position.
DDC_ResultTable - output hits should be written in table format; there is no highlighting in the string at all; instead, the highlighted words are written after the string.
DDC_ResultJson - output hits should be written in json format

Enumerator
DDC_ResultHTML
DDC_ResultText
DDC_ResultTable
DDC_ResultDocIds
DDC_ResultJson

Function Documentation

◆ ddcInitGlobal()

void ddcInitGlobal ( void )

inline

global initialization

References ddcInitCurl(), and ddcInitLocale().

Referenced by main().

Here is the call graph for this function:

Here is the caller graph for this function:

◆ IsPruneFilterType()

bool IsPruneFilterType ( HitSortEnum e )

inline

References GreaterByPruneKey, and LessByPruneKey.

Referenced by CQueryOptions::IsPruneFilter(), and CDDCFilterWithBounds::IsPruneFilter().

Here is the caller graph for this function:

◆ IsNullSort()

bool IsNullSort ( HitSortOrderEnum e )

inline

References hsoNone.

◆ IsCountSort()

bool IsCountSort ( HitSortOrderEnum e )

inline

References hsoAscendingCountKeys, and hsoDescendingCountValues.

Referenced by CFirstHitsQueryResult::CollectCounts(), CConcSession::GenerateHitStrings(), CDDCBranchServer::GetFirstHitsFromCorpora(), CConcSession::GetHitIds(), CConcSession::GetSortKeyHint(), CDDCLeafServer::handle__get_first_hits(), CDDCBranchServer::handle__get_first_hits(), CDDCBranchServer::handle__get_hit_strings(), CDDCLeafServer::handle__run_query(), CDDCBranchServer::RunDistributed(), CDDCBranchServer::RunGetFirstHits(), and CDDCBranchServer::UpdateNavCache().

Here is the caller graph for this function:

◆ IsCountKeySort()

bool IsCountKeySort ( HitSortOrderEnum e )

inline

References hsoAscendingCountKeys, and hsoDescendingCountKeys.

◆ IsCountValueSort()

bool IsCountValueSort ( HitSortOrderEnum e )

inline

References hsoAscendingCountValues, and hsoDescendingCountValues.

◆ InitConcordDicts()

bool InitConcordDicts ( )

initializes morphology dictionaries

References bEnglishMorph, bGermanMorph, bRussianMorph, ddcLogInfo, InitMorphologySystem(), and CExpc::m_strCause.

Referenced by do_start().

Here is the call graph for this function:

Here is the caller graph for this function:

◆ FreeConcordDicts()

void FreeConcordDicts ( )

deletes morphology dictionaries

References engGramTab, engLemmatizer, gerGramTab, gerLemmatizer, rusGramTab, and rusLemmatizer.

Referenced by UnloadData().

Here is the caller graph for this function:

◆ GetLemmatizerByLanguage()

const CLemmatizer* GetLemmatizerByLanguage ( MorphLanguageEnum Langua )

return a morphology dictionary by a language identifier

References bEnglishMorph, bGermanMorph, bRussianMorph, engLemmatizer, gerLemmatizer, InitMorphologySystem(), morphEnglish, morphGerman, morphRussian, and rusLemmatizer.

Referenced by TxMorph::expand(), GetGramInfosFromWord(), GetParadigmCollection(), and GetWordForms().

Here is the call graph for this function:

Here is the caller graph for this function:

◆ GetGramtabByLanguage()

const CAgramtab* GetGramtabByLanguage ( MorphLanguageEnum Langua )

return a grammatical table by a language identifier

References bEnglishMorph, bGermanMorph, bRussianMorph, engGramTab, gerGramTab, InitMorphologySystem(), morphEnglish, morphGerman, morphRussian, and rusGramTab.

Referenced by GetGramInfosFromWord(), GetGramInfoStr(), GetParadigmByGroups(), GetParadigmFromDictionary(), and GetStringByParadigm().

Here is the call graph for this function:

Here is the caller graph for this function:

◆ GetIndexItemSetByVectorString()

string GetIndexItemSetByVectorString	(	const vector< string > &	TokenProperties,
		bool	bRegexp
	)

return a string representation of a set of token properties (in the format which is used in the index)

References MorphAnnotationsDelimRegExp.

Referenced by CQueryTokenNode::CreateMorphAnnotationPattern(), CreateMorphIndex(), CConcIndexator::IndexMorphXml(), and CHitCompareByBreak::operator()().

Here is the caller graph for this function:

Variable Documentation

◆ globalFieldDelimeter

const char globalFieldDelimeter = '\t'

a globally defined delimiter, which is used to delimit fields in one record (the first field is always a token)

Referenced by CConcIndexator::IndexMorphXml(), CStringIndexator::IndexOneToken(), CConcIndexator::IndexTextOrHtmlFile(), CStringIndexator::IndexTokenFixLongColumns(), and TruncateLongColumns().

◆ PredefinedTableLineTag

const string PredefinedTableLineTag = "l"

a globally defined xml-tag, which is used to separate records if CConcIndexator::m_IndexType is Free_Index

Referenced by GetCWBFormattedStringRecursive(), and CHitBorders::RegisterBorderIndices().

◆ ChunkIndexName

const string ChunkIndexName = "chunk"

a globally defined index name for chunks

Referenced by CQueryTokenNode::CreateChunkPattern(), CStringIndexator::GetIndicesString(), and CStringIndexator::RegisterChunkIndex().

◆ PredefinedFileBreakName

const string PredefinedFileBreakName = "file"

a globally defined break collection name for corpus files

Referenced by CQCount::CountLocal(), CConcIndexator::CreateAsUnion(), CConcSession::GenerateHitStrings(), CHitBorders::GetBorderIndicesString(), CConcSession::GetContext(), CConcSession::GetContextJson(), ConcIndexatorInvoker::IndexFile(), CTabFormatIndexator::ProcessBreakRecord(), CHitBorders::RegisterBorderIndices(), and CQueryOptions::toString().

◆ PredefinedTextAreaBreakName

const string PredefinedTextAreaBreakName = "textarea"

a globally defined break collection name for text areas

Referenced by CConcIndexator::IndexTextOrHtmlFile(), CHitBorders::RegisterBorderIndices(), and CConcSession::SetHitType().

◆ MaxShortOccurCacheSize

const size_t MaxShortOccurCacheSize = 1000000

MaxShortOccurCacheSize is the upper bound of CShortOccurCache::m_Data.size(). It is introduced to restrict memory usage.

Referenced by CShortOccurCache::CouldContainMore().

◆ MaxBiblStringLen

const size_t MaxBiblStringLen = 20000

Referenced by CDocumentIterator::ReadTabFormatDocument(), CFreeBiblStringIndex::RegisterBiblStringItemId(), CBibliography::Sanitize(), and SanitizeBiblString().

◆ HitSortEnumNames

const char* HitSortEnumNames[HitSortsCount]

static

Initial value:

=
{
    "NoSort",
    "LessByDate",
    "GreaterByDate",
    "LessBySize",
    "GreaterBySize",
    "LessByFreeBiblField",
    "GreaterByFreeBiblField",
    "LessByRank",
    "GreaterByRank",
    "LessByMiddleContext",
    "GreaterByMiddleContext",
    "LessByLeftContext",
    "GreaterByLeftContext",
    "LessByRightContext",
    "GreaterByRightContext",
    "RandomSort",
    "LessByCountKey",
    "GreaterByCountKey",
    "LessByCountValue",
    "GreaterByCountValue",
    "LessByPruneKey",
    "GreaterByPruneKey"
}

symbolic names for HitSortEnum, for debugging

Referenced by CConcSession::GetAllHits(), and CQCount::jsonData().

◆ HitSortEnumStrings

const char* HitSortEnumStrings[HitSortsCount]

static

Initial value:

= {
    "no_sort",
    "asc_by_date",
    "desc_by_date",
    "asc_by_size",
    "desc_by_size",
    "asc",
    "desc",
    "asc_by_rank",
    "desc_by_rank",
    "asc_middle",
    "desc_middle",
    "asc_left",
    "desc_left",
    "asc_right",
    "desc_right",
    "random",
    "asc_by_key",
    "desc_by_key",
    "asc_by_count",
    "desc_by_count",
    "prune_asc",
    "prune_desc"
}

symbolic names for HitSortEnum, for stringification

Referenced by CQCount::countOptionsToString(), CQFSort::jsonType(), CConcSession::SortKeyLB(), CQFSort::toString(), CQFBiblSort::toString(), CQFContextSort::toString(), and CQFPrune::toString().

◆ MorphAnnotationsDelim

const string MorphAnnotationsDelim = "#"

a delimiter between morphological annotations

Referenced by GetGramInfosFromWord(), and CConcIndexator::IndexMorphXml().

◆ MorphAnnotationsDelimRegExp

const string MorphAnnotationsDelimRegExp = "[^#]*"

a regular expression, which passes everything within one morphological annotation

Referenced by GetIndexItemSetByVectorString().

Classes

Macros

Typedefs

Enumerations

Functions

Variables

Detailed Description

Macro Definition Documentation

◆ DDC_SORTKEY_MAXLEN

Typedef Documentation

◆ CTokenNo

◆ CFileNo

◆ CShortOccurCacheMap

◆ COccurrBuffer

Enumeration Type Documentation

◆ HitSortEnum

◆ HitSortOrderEnum

◆ DDCFormatTypeEnum

Function Documentation

◆ ddcInitGlobal()

◆ IsPruneFilterType()

◆ IsNullSort()

◆ IsCountSort()

◆ IsCountKeySort()

◆ IsCountValueSort()

◆ InitConcordDicts()

◆ FreeConcordDicts()

◆ GetLemmatizerByLanguage()

◆ GetGramtabByLanguage()

◆ GetIndexItemSetByVectorString()

Variable Documentation

◆ globalFieldDelimeter

◆ PredefinedTableLineTag

◆ ChunkIndexName

◆ PredefinedFileBreakName

◆ PredefinedTextAreaBreakName

◆ MaxShortOccurCacheSize

◆ MaxBiblStringLen

◆ HitSortEnumNames

◆ HitSortEnumStrings

◆ MorphAnnotationsDelim

◆ MorphAnnotationsDelimRegExp