ddc
Classes | Macros | Typedefs | Enumerations | Functions | Variables
ConcCommon.h File Reference

A file for globally defined constants and classes. More...

#include "../CommonLib/utilit.h"
#include "../CommonLib/ddcLog.h"
#include "list"
#include "limits.h"
#include "../GraphanLib/GraphmatFile.h"
#include "../LemmatizerLib/Lemmatizers.h"
#include "../AgramtabLib/EngGramTab.h"
#include "../AgramtabLib/RusGramTab.h"
#include "../AgramtabLib/GerGramTab.h"
#include "../CommonLib/DDC_common.h"
#include "../CommonLib/ddcObject.h"
#include "../tinyxml/tinyxml.h"
#include "../ConcordLib/CCurl.h"
Include dependency graph for ConcCommon.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Classes

class  CShortOccurCache
 
struct  CShortOccurCache::CDataReference
 the structure holds a pointer to a vector of occurrences and its size More...
 
class  CFreeBiblIndexInterface
 
struct  CHitSortKey
 
struct  CDDCFilterWithBounds
 
struct  CHit
 
struct  CHitCompareByBreak
 compare hits by break-number (for query evaluation, e.g. CQueryBinaryOperationNode::hits_and_positions_intersection()) More...
 

Macros

#define DDC_SORTKEY_MAXLEN   256
 

Typedefs

typedef DWORD CTokenNo
 integer type CTokenNo is used to refer to the index of a token in the corpus More...
 
typedef DWORD CFileNo
 integer type CFileNo is used to refer to a single document (file) in the corpus More...
 
typedef map< string, CShortOccurCache > CShortOccurCacheMap
 a type mapping an index string to its occurrences More...
 
typedef vector< CTokenNo > COccurrBuffer
 a type for holding occurrences during reading from the disk More...
 

Enumerations

enum  HitSortEnum {
  NoSort = 0, LessByDate = 1, GreaterByDate = 2, LessBySize = 3,
  GreaterBySize = 4, LessByFreeBiblField = 5, GreaterByFreeBiblField = 6, LessByRank = 7,
  GreaterByRank = 8, LessByMiddleContext = 9, GreaterByMiddleContext = 10, LessByLeftContext = 11,
  GreaterByLeftContext = 12, LessByRightContext = 13, GreaterByRightContext = 14, RandomSort = 15,
  LessByCountKey = 16, GreaterByCountKey = 17, LessByCountValue = 18, GreaterByCountValue = 19,
  LessByPruneKey = 20, GreaterByPruneKey = 21, HitSortsCount = 22
}
 
enum  HitSortOrderEnum {
  hsoDescending = -1, hsoNone = 0, hsoAscending = 1, hsoAscendingCountKeys = 2,
  hsoDescendingCountKeys = 3, hsoAscendingCountValues = 4, hsoDescendingCountValues = 5
}
 
enum  DDCFormatTypeEnum {
  DDC_ResultHTML, DDC_ResultText, DDC_ResultTable, DDC_ResultDocIds,
  DDC_ResultJson
}
 FormatTypeEnum defines the format of output hits: More...
 

Functions

void ddcInitGlobal (void)
 global initialization More...
 
bool IsPruneFilterType (HitSortEnum e)
 
bool IsNullSort (HitSortOrderEnum e)
 
bool IsCountSort (HitSortOrderEnum e)
 
bool IsCountKeySort (HitSortOrderEnum e)
 
bool IsCountValueSort (HitSortOrderEnum e)
 
bool InitConcordDicts ()
 initializes morphology dictionaries More...
 
void FreeConcordDicts ()
 deletes morphology dictionaries More...
 
const CLemmatizer * GetLemmatizerByLanguage (MorphLanguageEnum Langua)
 return a morphology dictionary by a language identifier More...
 
const CAgramtab * GetGramtabByLanguage (MorphLanguageEnum Langua)
 return a grammatical table by a language identifier More...
 
string GetIndexItemSetByVectorString (const vector< string > &TokenProperties, bool bRegexp)
 return a string representation of a set of token properties (in the format which is used in the index) More...
 

Variables

const char globalFieldDelimeter = '\t'
 a globally defined delimiter, which is used to delimit fields in one record (the first field is always a token) More...
 
const string PredefinedTableLineTag = "l"
 a globally defined xml-tag, which is used to separate records if CConcIndexator::m_IndexType is Free_Index More...
 
const string ChunkIndexName = "chunk"
 a globally defined index name for chunks More...
 
const string PredefinedFileBreakName = "file"
 a globally defined break collection name for corpus files More...
 
const string PredefinedTextAreaBreakName = "textarea"
 a globally defined break collection name for text areas More...
 
const size_t MaxShortOccurCacheSize = 1000000
 MaxShortOccurCacheSize is the upper bound of CShortOccurCache::m_Data.size(). It is introduced to restrict memory usage. More...
 
const size_t MaxBiblStringLen = 20000
 
static const char * HitSortEnumNames [HitSortsCount]
 
static const char * HitSortEnumStrings [HitSortsCount]
 
const string MorphAnnotationsDelim = "#"
 a delimiter between morphological annotations More...
 
const string MorphAnnotationsDelimRegExp = "[^#]*"
 a regular expression, which passes everything within one morphological annotation More...
 

Detailed Description

A file for globally defined constants and classes.

Macro Definition Documentation

◆ DDC_SORTKEY_MAXLEN

#define DDC_SORTKEY_MAXLEN   256

maximum length of string sort-keys passed from subcorpus servers to distributed server, including terminating NUL (see CConcHolder::GetHitIds() and CDDCServerListenHost::GetFirstHitsFromCorpora())

Referenced by CConcSession::GetCountIds(), CDDCBranchServer::GetFirstHitsFromCorpora(), and CConcSession::GetHitIds().

Typedef Documentation

◆ CTokenNo

typedef DWORD CTokenNo

integer type CTokenNo is used to refer to the index of a token in the corpus

◆ CFileNo

typedef DWORD CFileNo

integer type CFileNo is used to refer to a single document (file) in the corpus

◆ CShortOccurCacheMap

a type mapping an index string to its occurrences

◆ COccurrBuffer

typedef vector<CTokenNo> COccurrBuffer

a type for holding occurrences during reading from the disk

Enumeration Type Documentation

◆ HitSortEnum

HitSortEnum: this enum defines all possible sort orders which can be applied to an output hit set.

Enumerator
NoSort 

no sort operators, only filtering (used by #has_field[])

LessByDate 

sort by the issue date (ascending)

GreaterByDate 

sort by the issue date (descending)

LessBySize 

sort by the size of the hit in tokens (ascending)

GreaterBySize 

sort by the size of the hit in tokens (descending)

LessByFreeBiblField 

sort by a free bibliographical field (ascending)

GreaterByFreeBiblField 

sort by a free bibliographical field (descending)

LessByRank 

sort by document rank (ascending)

GreaterByRank 

sort by document rank (descending)

LessByMiddleContext 

sort by central context (ascending)

GreaterByMiddleContext 

sort by match context (descending)

LessByLeftContext 

sort by left context (ascending)

GreaterByLeftContext 

sort by left context (descending)

LessByRightContext 

sort by right context (ascending)

GreaterByRightContext 

sort by right context (descending)

RandomSort 

sort by random key

LessByCountKey 

sort by count()-key (ascending)

GreaterByCountKey 

sort by count()-key (descending)

LessByCountValue 

sort by count()-value (ascending)

GreaterByCountValue 

sort by count()-value (descending)

LessByPruneKey 

sort by #prune[]-key (ascending)

GreaterByPruneKey 

sort by #prune[]-key (descending)

HitSortsCount 

end-of-enum sentinel

◆ HitSortOrderEnum

symbolic hit sort order constants

Enumerator
hsoDescending 
hsoNone 
hsoAscending 
hsoAscendingCountKeys 
hsoDescendingCountKeys 
hsoAscendingCountValues 
hsoDescendingCountValues 

◆ DDCFormatTypeEnum

FormatTypeEnum defines the format of output hits:

  • DDC_ResultHTML - output hits should be written in HTML-format; the tags "<STRONG><FONT COLOR=red>" are used for highlighting.
  • DDC_ResultText - output hits should be written in plain-text format; the string "&&" is used to mark the first highlighted position, and "_" is used to mark the following highlighted positions.
  • DDC_ResultTable - output hits should be written in table format; there is no highlighting in the string at all; instead, the highlighted words are written after the string
  • DDC_ResultJson - output hits should be written in json format
Enumerator
DDC_ResultHTML 
DDC_ResultText 
DDC_ResultTable 
DDC_ResultDocIds 
DDC_ResultJson 

Function Documentation

◆ ddcInitGlobal()

void ddcInitGlobal ( void  )
inline

global initialization

References ddcInitCurl(), and ddcInitLocale().

Referenced by main().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ IsPruneFilterType()

bool IsPruneFilterType ( HitSortEnum  e)
inline

References GreaterByPruneKey, and LessByPruneKey.

Referenced by CQueryOptions::IsPruneFilter(), and CDDCFilterWithBounds::IsPruneFilter().

Here is the caller graph for this function:

◆ IsNullSort()

bool IsNullSort ( HitSortOrderEnum  e)
inline

References hsoNone.

◆ IsCountSort()

bool IsCountSort ( HitSortOrderEnum  e)
inline

◆ IsCountKeySort()

bool IsCountKeySort ( HitSortOrderEnum  e)
inline

◆ IsCountValueSort()

bool IsCountValueSort ( HitSortOrderEnum  e)
inline

◆ InitConcordDicts()

bool InitConcordDicts ( )

initializes morphology dictionaries

References bEnglishMorph, bGermanMorph, bRussianMorph, ddcLogInfo, InitMorphologySystem(), and CExpc::m_strCause.

Referenced by do_start().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ FreeConcordDicts()

void FreeConcordDicts ( )

deletes morphology dictionaries

References engGramTab, engLemmatizer, gerGramTab, gerLemmatizer, rusGramTab, and rusLemmatizer.

Referenced by UnloadData().

Here is the caller graph for this function:

◆ GetLemmatizerByLanguage()

const CLemmatizer* GetLemmatizerByLanguage ( MorphLanguageEnum  Langua)

return a morphology dictionary by a language identifier

References bEnglishMorph, bGermanMorph, bRussianMorph, engLemmatizer, gerLemmatizer, InitMorphologySystem(), morphEnglish, morphGerman, morphRussian, and rusLemmatizer.

Referenced by TxMorph::expand(), GetGramInfosFromWord(), GetParadigmCollection(), and GetWordForms().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ GetGramtabByLanguage()

const CAgramtab* GetGramtabByLanguage ( MorphLanguageEnum  Langua)

return a grammatical table by a language identifier

References bEnglishMorph, bGermanMorph, bRussianMorph, engGramTab, gerGramTab, InitMorphologySystem(), morphEnglish, morphGerman, morphRussian, and rusGramTab.

Referenced by GetGramInfosFromWord(), GetGramInfoStr(), GetParadigmByGroups(), GetParadigmFromDictionary(), and GetStringByParadigm().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ GetIndexItemSetByVectorString()

string GetIndexItemSetByVectorString ( const vector< string > &  TokenProperties,
bool  bRegexp 
)

return a string representation of a set of token properties (in the format which is used in the index)

References MorphAnnotationsDelimRegExp.

Referenced by CQueryTokenNode::CreateMorphAnnotationPattern(), CreateMorphIndex(), CConcIndexator::IndexMorphXml(), and CHitCompareByBreak::operator()().

Here is the caller graph for this function:

Variable Documentation

◆ globalFieldDelimeter

const char globalFieldDelimeter = '\t'

a globally defined delimiter, which is used to delimit fields in one record (the first field is always a token)

Referenced by CConcIndexator::IndexMorphXml(), CStringIndexator::IndexOneToken(), CConcIndexator::IndexTextOrHtmlFile(), CStringIndexator::IndexTokenFixLongColumns(), and TruncateLongColumns().

◆ PredefinedTableLineTag

const string PredefinedTableLineTag = "l"

a globally defined xml-tag, which is used to separate records if CConcIndexator::m_IndexType is Free_Index

Referenced by GetCWBFormattedStringRecursive(), and CHitBorders::RegisterBorderIndices().

◆ ChunkIndexName

const string ChunkIndexName = "chunk"

◆ PredefinedFileBreakName

const string PredefinedFileBreakName = "file"

◆ PredefinedTextAreaBreakName

const string PredefinedTextAreaBreakName = "textarea"

a globally defined break collection name for text areas

Referenced by CConcIndexator::IndexTextOrHtmlFile(), CHitBorders::RegisterBorderIndices(), and CConcSession::SetHitType().

◆ MaxShortOccurCacheSize

const size_t MaxShortOccurCacheSize = 1000000

MaxShortOccurCacheSize is the upper bound of CShortOccurCache::m_Data.size(). It is introduced to restrict memory usage.

Referenced by CShortOccurCache::CouldContainMore().

◆ MaxBiblStringLen

const size_t MaxBiblStringLen = 20000

◆ HitSortEnumNames

const char* HitSortEnumNames[HitSortsCount]
static
Initial value:
=
{
"NoSort",
"LessByDate",
"GreaterByDate",
"LessBySize",
"GreaterBySize",
"LessByFreeBiblField",
"GreaterByFreeBiblField",
"LessByRank",
"GreaterByRank",
"LessByMiddleContext",
"GreaterByMiddleContext",
"LessByLeftContext",
"GreaterByLeftContext",
"LessByRightContext",
"GreaterByRightContext",
"RandomSort",
"LessByCountKey",
"GreaterByCountKey",
"LessByCountValue",
"GreaterByCountValue",
"LessByPruneKey",
"GreaterByPruneKey"
}

symbolic names for HitSortEnum, for debugging

Referenced by CConcSession::GetAllHits(), and CQCount::jsonData().

◆ HitSortEnumStrings

const char* HitSortEnumStrings[HitSortsCount]
static
Initial value:
= {
"no_sort",
"asc_by_date",
"desc_by_date",
"asc_by_size",
"desc_by_size",
"asc",
"desc",
"asc_by_rank",
"desc_by_rank",
"asc_middle",
"desc_middle",
"asc_left",
"desc_left",
"asc_right",
"desc_right",
"random",
"asc_by_key",
"desc_by_key",
"asc_by_count",
"desc_by_count",
"prune_asc",
"prune_desc"
}

symbolic names for HitSortEnum, for stringification

Referenced by CQCount::countOptionsToString(), CQFSort::jsonType(), CConcSession::SortKeyLB(), CQFSort::toString(), CQFBiblSort::toString(), CQFContextSort::toString(), and CQFPrune::toString().

◆ MorphAnnotationsDelim

const string MorphAnnotationsDelim = "#"

a delimiter between morphological annotations

Referenced by GetGramInfosFromWord(), and CConcIndexator::IndexMorphXml().

◆ MorphAnnotationsDelimRegExp

const string MorphAnnotationsDelimRegExp = "[^#]*"

a regular expression, which passes everything within one morphological annotation

Referenced by GetIndexItemSetByVectorString().