ddc
|
#include <Concordance.h>
Public Member Functions | |
CConcordance () | |
~CConcordance () | |
void | RegisterIndicesToShow (const string &IndexListStr) |
wrapper for IndicesToShow option; now accepts integer position (min=1), long name, short name, or alias More... | |
string | GetIndicesToShowStr (bool Positional=false) const |
get opt-style string declaration for IndicesToShow (empty if default) More... | |
RML_RE::Options | GetRegexOptions () const |
return default pcre regex options More... | |
bool | IsDwdsCorpusInterface () const |
return true, if DDC outputs results in DWDS format More... | |
bool | IsGutenbergInterface () const |
return true, if DDC outputs results in Gutenberg project format More... | |
bool | HasContextOperator () const |
return true, if query context operator (#Cntxt) is switched off More... | |
bool | UseDwdsThesaurus () const |
return true, if DWDS thesaurus is enabled (index "Thes") More... | |
bool | OutputBibliographyOfHits () const |
return true, if DDC should output bibliographical information for hits instead of corpus file names More... | |
bool | IndexPunctuation () const |
wrapper for m_bIndexPunctuation More... | |
string | GetHtmlReference (size_t posFile) const |
get an HTML formatted reference to a corpus file More... | |
string | GetShortFilename (size_t posFile) const |
get a reference to a corpus file without the common left prefix More... | |
string | GetFileNameForCorpusFileNames () const |
get file name for storing corpus file names More... | |
string | GetFileNameForMaskedFiles () const |
get file name for masked files (strings) More... | |
string | GetFileNameForMaskedFileIds () const |
get file name for masked files (IDs, binary) More... | |
vector< string > | GetTokenFields (const COutputToken &tok) |
parse a delimited token into fields by splitting on m_InterpDelimiter More... | |
size_t | GetCorpusFilesCount () const |
get the number of indexed corpus files More... | |
string | GetCorpusFile (CFileNo FileNo) const |
get corpus file by index More... | |
size_t | GetMaskedFilesCount () const |
get count of masked files More... | |
void | LoadSourceFilesAndOptions (string FileName, bool reallyReadSourceFiles=true) |
loads the list of source files and parses the option file (*.opt) More... | |
void | LoadCorpusFiles () |
load list of corpus files (*.con) More... | |
string | GetCommonFilePrefix () const |
(re-)compute common prefix of corpus files in m_CorpusFiles More... | |
void | LoadMaskedFiles () |
load list of masked (deleted) Corpus File Definition More... | |
void | LoadOptionsFromFile (const string &OptFile) |
loads options from a named file (calls LoadOptionsFromString()) More... | |
void | LoadProject (string FileName, bool includeSourceFiles=true) |
loads everything More... | |
time_t | UpdateTimestamp (const char *filename) |
update m_Timestamp to mtime of Filename if it is newer than the current m_Timestamp; More... | |
void | DumpBibliography (FILE *f=stdout) const |
dump loaded metadata to a stream (JSON) More... | |
string | DumpFileBibliography (DWORD FileNo) const |
dump file metadata to a string (JSON) More... | |
void | DumpFileIndexJson (DWORD FileNo, FILE *f=stdout) const |
dump file data to a file (JSON) More... | |
void | DumpIndex (string dirname, CIndexDumpFormat fmt=idfJson) const |
dump index data to a directory (format-dependent) More... | |
void | DumpIndexToSingleTabFile (FILE *outfp) const |
dump index data to a single tab formatted file More... | |
bool | SaveOptions (string FileName) const |
saves options to option file (*.opt) More... | |
bool | GetAllOccurrences (vector< CTokenNo > &occurrences, size_t searchPeriodNo) const |
moo: get a vector of all occurrences in a given search period; used by universal wildcard (*) queries More... | |
bool | GetOccurrencesByPosition (const string &BreakName, int anchor, vector< CTokenNo > &occurrences, size_t searchPeriodNo) const |
moo: search named break collection for matching positions, used by anchor queries More... | |
bool | UseTabFormatForLoading () const |
Public Member Functions inherited from CStringIndexator | |
CStringIndexator () | |
~CStringIndexator () | |
bool | RegisterStringIndices (const string &IndicesStr) |
read index declarations from a string and register them More... | |
bool | RegisterIndexAliases (const string &IndexAliasStr) |
read index alias declarations from a string and register them; returns true iff all registrations were successful More... | |
bool | RegisterIndexAlias (const string &AliasFrom, const string &AliasTo) |
register a single index alias (low-level); returns true iff AliasTo resolves to a known index according to m_AliasMap More... | |
void | RegisterIndexAlias (const string &AliasFrom, CStringIndexSet *idx) |
register a single index label or alias (lowest-level); if idx is NULL, any existing entry for AliasFrom will be deleted More... | |
void | SetPath (string Path) |
set the path to the indices More... | |
string | GetIndicesString () const |
return all registered index declarations, in opt-file syntax More... | |
string | GetIndexAliasString () const |
return all registered index aliases, in opt-file syntax More... | |
size_t | GetSearchPeriodsCount () const |
return the number of corpus periods More... | |
const CTokenNo & | GetSearchPeriod (size_t i) const |
get a corpus period by an index More... | |
bool | StartIndexing (string Path) |
call CreateTempFiles for all registered indices More... | |
void | TerminateIndexing () |
call DeleteTempFiles for all registered indices More... | |
bool | FinalSaveAllIndices (bool bAfterLoading) |
final saving all indices to disk (converting temp files to persistent) More... | |
bool | AddInputLoadIndexToMemoryLoadIndex () |
unites input index with memory index and clears input load index More... | |
bool | AddMemoryLoadIndexToMainLoadIndex () |
unites memory index with main index and clears memory load index More... | |
bool | SaveMemoryLoadIndex () |
store memory load index on the disk More... | |
CStringIndexSet * | GetIndexByName (const string &Name) |
return a pointer to the index by CStringIndexSet::m_Name (linear search) More... | |
CStringIndexSet * | GetIndexByNameOrShortName (const string &Name) |
return a pointer to the index by CStringIndexSet::m_Name or CStringIndexSet::m_ShortName (linear search) More... | |
CStringIndexSet * | GetIndexByAlias (const string &Alias) const |
return a pointer to the index by long-name, short-name, or alias (most abstract, uses m_IndexMap) More... | |
CStringIndexSet * | GetTokenIndex () |
return the first index that normally contains tokens themselves More... | |
const CStringIndexSet * | GetTokenIndex () const |
return the first index that normally contains tokens themselves More... | |
Public Member Functions inherited from CHitBorders | |
const CBreakCollection * | GetBreakCollectionByName (const string &Name) const |
moo: get break collection by long or short name More... | |
const vector< CBreakCollection > & | GetBreaks (void) const |
moo: get break collection map (dangerous) More... | |
CHitBorders () | |
string | GetBorderIndicesString () const |
return the string representation of break collection descriptions More... | |
string | WithinBreakName (const vector< string > &Within) const |
const ddcBreakVector * | GetBreaksByName (const string &ShortName) const |
returns a break collection by a short name More... | |
CTokenNo | GetCorpusEndTokenNo () const |
returns the value of the last file break (which should be equal to the last value of any break collection) More... | |
const ddcBreakVector & | GetFileBreaks () const |
quick reference to file breaks More... | |
CTokenNo | GetFileStartTokenNo (size_t FileNo) const |
returns the start position of corpus file FileNo More... | |
DWORD | GetPageNumber (size_t No) const |
returns m_PageBreaks[No].m_PageNumber (see CPageNumber) More... | |
bool | IsRegisteredBreak (const string &ShortName) const |
returns true if a short name is found in m_Breaks More... | |
void | RegisterBorderIndices (const char *IndicesStr) |
creates empty elements of m_Breaks by its string descriptions More... | |
bool | LoadHitBorders (string Path, bool useMMap=false) |
load break collections from the disk More... | |
void | ConvertHitsToPageBreaks (vector< CHit >::const_iterator hits_begin, vector< CHit >::const_iterator hits_end, const ddcBreakVector &Breaks, DwordVector &PageBreaks) const |
converts hits to the page breaks which contain these breaks More... | |
ddcVecFile< CPageNumber >::const_iterator | GetTokenPageBreak (CTokenNo tok) const |
get page break for a given token number as an iterator into m_PageBreaks More... | |
void | AddBreakByName (const string &ShortName, const CTokenNo &B) |
adds one break to a collection identified by a short name (during indexing) More... | |
void | BordersEndIndexing (string Path) |
closes all CBreakCollectionDescr::m_FileForIndexing from m_Breaks (during indexing) More... | |
void | StartTextAreaBorders () |
must be called before indexing each text area in order to create at least one break in each text area More... | |
void | EndTextAreaBorders (DWORD TextAreaEndTokenNo) |
must be called after indexing each text area in order to create at least one break in each text area More... | |
Public Member Functions inherited from CSourceFileHolder | |
CSourceFileHolder () | |
bool | SaveSourceFileList (string FileName) |
Saves the list of source files to file *.con. More... | |
void | DeleteSourceFile (long ItemNo) |
Deletes a source file. More... | |
void | AddSourceFile (const char *FileName) |
Adds a source file. More... | |
void | DeleteAllSourceFiles () |
deletes all Source File Definition source files More... | |
size_t | GetSourceFilesCount () const |
string | GetSourceFile (size_t FileNo) const |
get the source file by the index More... | |
void | AddSourceFilesFrom (const CSourceFileHolder &X) |
void | ReadSourceFileList (string FileName) |
int | FoundNotExistedFile () const |
finds a source file which does not exist, if there is no such file, returns -1 More... | |
bool | IsModified () const |
Public Attributes | |
MorphLanguageEnum | m_Language |
the language of the corpus More... | |
bool | m_bIndexMorphPatterns |
Enables the index of morph patterns. More... | |
bool | m_bIndexChunks |
Enables indexing and querying using chunks. More... | |
bool | m_bCaseSensitive |
if true, then the default search is case sensitive More... | |
bool | m_bShowNumberOfRelevantDocuments |
if true, then DDC always calculates the number of documents in which at least one hit is found More... | |
bool | m_bQueryOnlyFiles |
prohibits sentence break collection under DWDS_Index or MorphXML_Index More... | |
bool | m_bArchiveIndex |
sets that index should be archived under DWDS_Index or MorphXML_Index More... | |
bool | m_bResumeOnIndexErrors |
if true, CConcIndexatorInvoker skips source documents with errors More... | |
ddcCorpusList | m_CorpusFiles |
Corpus files More... | |
CMaskedFileSet | m_MaskedFiles |
CConcXml | m_BiblIndex |
a member which holds an index for bibliographical information More... | |
CHighlightTags | m_HtmlHighlighting |
highlighting tags for CConcHolder::m_ResultFormat == DDC_ResultHTML More... | |
CHighlightTags | m_TextHighlighting |
highlighting delimiters for CConcHolder::m_ResultFormat == DDC_ResultText More... | |
CHighlightTags | m_TableHighlighting |
highlighting delimiters for CConcHolder::m_ResultFormat == DDC_ResultTable More... | |
bool | m_bDisableDefaultQueryLexicalExpansion |
if true, then no default lexical expansion of query words occurs More... | |
int | m_LeftKwicContextSize |
the size of the left context of the highlighted words in document search More... | |
int | m_RightKwicContextSize |
the size of the right context of the highlighted words in document search More... | |
int | m_NumberOfKwicLinesInSnippets |
the maximal number of kwic lines in file snippets More... | |
double | m_TfIdfRank |
the parameter for TfIdf ranking More... | |
double | m_NearRank |
the parameter for Near ranking More... | |
double | m_PositionRank |
the parameter for Position ranking More... | |
string | m_InterpDelimiter |
delimiter to use between token index fields in output More... | |
string | m_TokenDelimiter |
delimiter to use between tokens in output More... | |
bool | m_Utf8 |
whether to assume indexed data is utf8 encoded (default=no) More... | |
bool | m_bAllowUnsafeQueries |
potentially unsafe queries will throw an exception unless this is true (default=false) More... | |
bool | m_bAllowCountByTokenAttributes |
using any token attribute as a count-key will throw an exception unless this is true (default=true) More... | |
bool | m_bLemmaQueryUsesMorphPattern |
interpret "%foo" queries using MorphPattern? (default=true) More... | |
vector< size_t > | m_IndicesToShow |
indices to show for Free_Index More... | |
time_t | m_Timestamp |
moo: timestamp of project *._con file More... | |
size_t | m_MaxCachedHitsCount |
moo: maximum number of hits in a CConcHolder cache entry – query results with more than MaxCachedHitsCount hits will not be cached (default=512) More... | |
size_t | m_MaxQueryCacheSize |
moo: maximum number of queries to be cached by an associated CConcHolder (default=512) More... | |
TxDispatcher | m_Txd |
term expansion dispatcher; should define at least an entry for "default" More... | |
map< string, string > | m_OpDefaultIndexNames |
maps token-query operators to default index names; keys are as returned by CQToken::OperatorKey() More... | |
map< string, pair< bool, string > > | m_ServerInfo |
maps symbolic keys to string constants to be included in the corpus 'info' response as info.user.KEY=VAL; values are pairs (isFile,Value) s.t. Value is a filename iff isFile is a true value, otherwise a literal value More... | |
Public Attributes inherited from CStringIndexator | |
string | m_Path |
where all indices are stored More... | |
bool | m_bMemoryMap |
whether to directly mmap() index file data (default=false) More... | |
vector< CStringIndexSet * > | m_Indices |
the registered indices, by positional index More... | |
IndexAliasMap | m_IndexAlias |
declared index aliases (FROM -> TO); not really used at runtime More... | |
IndexMap | m_IndexMap |
all registered indices, keyed by long-name, short-name, or label (LABEL -> INDEX) More... | |
size_t | m_MaxRegExpExpansionSize |
the maximal number of index items which can be included in an expansion set of one regular expression More... | |
CStringIndexSet * | m_pChunkIndex |
a quick reference to a chunk index, if CConcIndexator::m_bIndexChunks is on, otherwise null More... | |
Protected Types | |
enum | DDCIndexTypeEnum { DWDS_Index, MorphXML_Index, Free_Index, TabFormat_Index } |
enum DDCIndexTypeEnum contains index types. Each index type determines DDC indices and break collections. More... | |
Protected Member Functions | |
void | AssertHasPath () const |
void | LoadOptionsFromString (string Options) |
loads options from a string More... | |
void | InitDefaultOptions () |
string | SaveOptionsToString () const |
saves options to a string More... | |
Protected Member Functions inherited from CStringIndexator | |
bool | RegisterChunkIndex () |
register chunk index (chunks:NP, VP etc) More... | |
string | GetSearchPeriodsFileName () const |
return the file name for search periods More... | |
bool | DestroyIndices () |
call DestroyIndexSet for all registered indices More... | |
void | ReadIndicesFromTheDisk () |
call ReadFromTheDisk for all registered indices More... | |
void | ClearStringIndices () |
clear m_Indices More... | |
void | IndexOneToken (CTokenIndexator *document, const char *Line, bool tryFixErrors=true) |
index one token and its properties (delimited by CConcCommon.h::globalFieldDelimeter) More... | |
void | IndexTokenFixLongColumns (const size_t MaxLen, const size_t nCols, const char *InputLine, char *Out) |
moo: truncate long columns in InputLine, storing result in Out More... | |
Protected Member Functions inherited from CHitBorders | |
string | GetPageBreaksFileName (string Path) const |
returns the file name for page breaks More... | |
string | GetShortNameByName (const string &BreakName) const |
returns the short name of a break collection by the long or the short name More... | |
bool | StartIndexing (string Path) |
opens for writing all CBreakCollectionDescr::m_FileForIndexing from m_Breaks More... | |
bool | RemoveHitBordersFileAndClear (string Path) |
deletes all break files More... | |
void | AddPageBreak (const CPageNumber &P) |
adds one page break More... | |
void | SavePageBreaks (const string &ProjectPath) |
save page break file More... | |
int | RegisterBreak (string ShortName, string LongName) |
int | EnsureRegisteredBreak (string ShortName, string LongName) |
int | GetBreakCollectionIndexByName (string ShortName) const |
void | AddBreakByIndex (DWORD BreakCollectionNo, const CTokenNo &B) |
Protected Attributes | |
DDCIndexTypeEnum | m_IndexType |
the type of index More... | |
bool | m_bIndexPunctuation |
Enables indexing all punctuation marks. More... | |
bool | m_bUseParagraphTagToDivide |
Enables using "<p>" tag as a paragraph delimiter. More... | |
bool | m_bEmptyLineIsSentenceDelim |
if m_bEmptyLineIsSentenceDelim is on, every empty line in the input file is considered to be the end of the sentence. More... | |
bool | m_bUseIndention |
if m_bUseIndention is on, the program tries to find paragraphs using indentions More... | |
DWORD | m_UserMaxTokenCountInOnePeriod |
The maximal number of occurrences in one subcorpus (defined by the user) More... | |
DWORD | m_UserMaxInputLoadIndexSize |
The maximal number of occurrences in the input load index, by default 400000. More... | |
bool | m_bUseDwdsThesaurus |
Enables indexing and querying using DWDS Thesaurus. More... | |
Protected Attributes inherited from CStringIndexator | |
vector< CTokenNo > | m_SearchPeriods |
search periods of the corpus More... | |
Protected Attributes inherited from CHitBorders | |
vector< CBreakCollection > | m_Breaks |
all breaks More... | |
map< string, int > | m_ShortName2BreakCollection |
the map from CBreakCollection.m_ShortName to the index in m_Breaks More... | |
map< string, int > | m_LongName2BreakCollection |
the map from CBreakCollection.m_LongName to the index in m_Breaks More... | |
int | m_FileBreakCollectionNo |
a quick reference to file breaks (which are also stored in m_Breaks) More... | |
string | m_DefaultBreakName |
The name of the default break collection (written in the options file) More... | |
ddcVecFile< CPageNumber > | m_PageBreaks |
page number collection More... | |
vector< DWORD > | m_LastTextAreaBreaks |
Protected Attributes inherited from CSourceFileHolder | |
vector< string > | m_SourceFiles |
Source files More... | |
bool | m_bModifiedListOfFiles |
Private Member Functions | |
string | GetBiblIndexFileName () const |
string | GetBiblFileName () const |
bool | HasEqualOptions (const CConcordance &X) const |
checks if X has the same option More... | |
const char * | GetIndexTypeStr () const |
return a string representation of index type More... | |
bool | ReadIndexTypeFromStr (const string &s) |
read the index type from a string More... | |
void | DumpFileIndexTabs (DWORD FileNo, FILE *f) const |
dump file data to a file (tabs) More... | |
Private Attributes | |
vector< BYTE > | m_PcreCharacterTables |
a table of character properties for regular expressions which depend on CConcIndexator::m_Language More... | |
bool | m_bDwdsCorpusInterface |
if m_bDwdsCorpusInterface is on, the program outputs results in DWDS format More... | |
bool | m_bGutenbergInterface |
if m_bGutenbergInterface is on, the program outputs results in a format of Gutenberg project More... | |
bool | m_bNoContextOperator |
should we switch off the context operator (#Cntxt) due to copyright More... | |
bool | m_bOutputBibliographyOfHits |
Should we show bibliography of the hits instead of filename. More... | |
string | m_InternetPathPrefix |
string | m_LocalPathPrefix |
string | m_CommonFilePrefix |
Additional Inherited Members | |
Public Types inherited from CStringIndexator | |
typedef map< string, string > | IndexAliasMap |
typedef for index alias maps More... | |
typedef map< string, CStringIndexSet * > | IndexMap |
typedef for index symbol table More... | |
CConcordance is the central class of DDC technology. Most of its slots come from the two parent classes: CStringIndexator (indexing tokens and their properties) and CHitBorders (indexing corpus divisions). This class also contains a list of corpus files and some indexing and querying options.
|
protected |
enum DDCIndexTypeEnum contains index types. Each index type determines DDC indices and break collections.
Enumerator | |
---|---|
DWDS_Index | A type for corpora without annotations written for each word. For example, the input text can be plain text. DDC always builds a token index and a file break collection for this index type. Optionally, DDC can build a "Thes" index, a "Morph" index and a sentence collection. |
MorphXML_Index | A type for xml-texts, if their words have predefined and written annotations. DDC always builds a token index and a "MorphPattern" index. It also creates a file and a sentence break collection. |
Free_Index | This index type is free and therefore it should be defined in the options file (fields "Indices" and "HitBorders"). The corpus should consist of xml-files with a bibliographical header and a body (text). The text is written in CWB format (http://www.ims.uni-stuttgart.de/projekte/CorpusWorkbench/CWBTutorial/cwb-tutorial.pdf). The original CWB format was changed in the following way: instead of line breaks, which are used to delimit records in the input file, DDC uses a special tag CConcCommon.h::PredefinedTableLineTag. This is done because line breaks are not preserved by the XML-parser. |
TabFormat_Index |
CConcordance::CConcordance | ( | ) |
CConcordance::~CConcordance | ( | ) |
|
private |
|
private |
|
private |
checks if X has the same option
References SaveOptionsToString().
|
private |
return a string representation of index type
References DWDS_Index, Free_Index, m_IndexType, MorphXML_Index, and TabFormat_Index.
Referenced by LoadOptionsFromString(), and SaveOptionsToString().
|
private |
read the index type from a string
References DWDS_Index, Free_Index, m_IndexType, MorphXML_Index, and TabFormat_Index.
Referenced by LoadOptionsFromString().
|
private |
dump file data to a file (tabs)
References ddcVecFile< T >::begin(), ddcVecFile< T >::end(), CStringIndexSet::GetTokensFromStorage(), jsonStr(), CHitBorders::CBreakCollection::m_BreakOffsets, CIndexSetForLoadingStage::m_bUseItemStorage, CBibliography::m_DateStr, CHitBorders::CBreakCollection::m_LongName, CStringIndexSet::m_Name, CBibliography::m_OrigBibl, CBibliography::m_ScanBibl, CHitBorders::CBreakCollection::m_ShortName, CStringIndexSet::m_ShortName, CBibliography::m_StartPageInfo, ddcVecFile< T >::size(), and UnknownPageNumber.
|
protected |
References errNonePath, and ErrorMessage().
Referenced by CConcIndexator::DestroyIndex().
|
protected |
loads options from a string
References CBiblIndex::AddBiblExpander(), TxDispatcher::addExpander(), BoolToString(), ddcLogDebug, ddcLogWarn, DWDS_Index, EchoOption(), EchoOptionLines(), TxDispatcher::ensureDefaultExpanders(), Format(), Free_Index, FreeBiblAttribOptionFieldName, CHitBorders::GetBorderIndicesString(), CHitBorders::GetBreakCollectionByName(), CBiblIndex::GetFreeBibiAttributesDescr(), CStringIndexator::GetIndexAliasString(), CStringIndexator::GetIndexByAlias(), GetIndexTypeStr(), CStringIndexator::GetIndicesString(), GetIndicesToShowStr(), GetLanguageByString(), GetRegexOptions(), GetStringByLanguage(), CBiblIndex::GetTextAreasDescr(), CBiblIndex::IsRegisteredBiblField(), LC_CTYPE_UTF8_DEFAULT, LC_NUMERIC_FORCE, LoadFileToString(), m_bAllowCountByTokenAttributes, m_bAllowUnsafeQueries, m_bArchiveIndex, m_bCaseSensitive, m_bDisableDefaultQueryLexicalExpansion, m_bDwdsCorpusInterface, m_bEmptyLineIsSentenceDelim, m_bGutenbergInterface, m_BiblIndex, m_bIndexChunks, m_bIndexMorphPatterns, m_bIndexPunctuation, m_bLemmaQueryUsesMorphPattern, CStringIndexator::m_bMemoryMap, m_bNoContextOperator, m_bOutputBibliographyOfHits, m_bQueryOnlyFiles, m_bResumeOnIndexErrors, m_bShowNumberOfRelevantDocuments, m_bUseDwdsThesaurus, m_bUseIndention, m_bUseParagraphTagToDivide, CBiblIndex::m_DefaultAttrName, m_HtmlHighlighting, m_IndexType, m_IndicesToShow, m_InternetPathPrefix, m_InterpDelimiter, m_Language, m_LeftKwicContextSize, m_LocalPathPrefix, m_MaxCachedHitsCount, m_MaxQueryCacheSize, CStringIndexator::m_MaxRegExpExpansionSize, m_NearRank, m_NumberOfKwicLinesInSnippets, m_OpDefaultIndexNames, CStringIndexator::m_Path, m_PcreCharacterTables, m_PositionRank, m_RightKwicContextSize, m_ServerInfo, m_TableHighlighting, m_TextHighlighting, m_TfIdfRank, m_TokenDelimiter, m_Txd, m_UserMaxInputLoadIndexSize, m_UserMaxTokenCountInOnePeriod, m_Utf8, morphEnglish, morphGeneric, morphUnknown, MorphXML_Index, CHighlightTags::ReadFromString(), ReadIndexTypeFromStr(), CHitBorders::RegisterBorderIndices(), 
CStringIndexator::RegisterChunkIndex(), CBiblIndex::RegisterFreeBiblAttributes(), CStringIndexator::RegisterIndexAliases(), RegisterIndicesToShow(), CStringIndexator::RegisterStringIndices(), CBiblIndex::RegisterTextAreas(), RelativeFileName(), RmlMakeLower(), RmlPcreMakeTables(), CBiblIndex::SetRegexOptions(), StringToBool(), TabFormat_Index, TextAreaOptionFieldName, CHighlightTags::ToString(), and unescapeCString().
Referenced by CConcIndexator::CreateAsUnion(), and LoadOptionsFromFile().
|
protected |
References TxDispatcher::clear(), DefaultKwicContextSize, DefaultMaxInputLoadIndexSize, DefaultMaxTokenCountInOnePeriod, DWDS_Index, m_bAllowCountByTokenAttributes, m_bAllowUnsafeQueries, m_bArchiveIndex, m_bCaseSensitive, m_bDisableDefaultQueryLexicalExpansion, m_bDwdsCorpusInterface, m_bEmptyLineIsSentenceDelim, m_bGutenbergInterface, m_BiblIndex, m_bIndexChunks, m_bIndexMorphPatterns, m_bIndexPunctuation, m_bLemmaQueryUsesMorphPattern, CStringIndexator::m_bMemoryMap, m_bNoContextOperator, m_bOutputBibliographyOfHits, m_bQueryOnlyFiles, m_bResumeOnIndexErrors, m_bShowNumberOfRelevantDocuments, m_bUseDwdsThesaurus, m_bUseIndention, m_bUseParagraphTagToDivide, CBiblIndex::m_DefaultAttrName, CHighlightTags::m_FirstCloser, CHighlightTags::m_FirstOpener, m_HtmlHighlighting, m_IndexType, m_InterpDelimiter, m_Language, m_LeftKwicContextSize, m_MaxCachedHitsCount, m_MaxQueryCacheSize, m_NearRank, m_NumberOfKwicLinesInSnippets, m_OpDefaultIndexNames, CStringIndexator::m_Path, m_PositionRank, CHighlightTags::m_RestCloser, CHighlightTags::m_RestOpener, m_RightKwicContextSize, m_ServerInfo, m_TableHighlighting, m_TextHighlighting, m_TfIdfRank, m_Timestamp, m_TokenDelimiter, m_Txd, m_UserMaxInputLoadIndexSize, m_UserMaxTokenCountInOnePeriod, m_Utf8, and morphUnknown.
Referenced by CConcIndexator::CConcIndexator().
|
protected |
saves options to a string
References TxDispatcher::configString(), DefaultKwicContextSize, Format(), CHitBorders::GetBorderIndicesString(), CBiblIndex::GetFreeBibiAttributesDescr(), CStringIndexator::GetIndexAliasString(), GetIndexTypeStr(), CStringIndexator::GetIndicesString(), GetIndicesToShowStr(), GetStringByLanguage(), CBiblIndex::GetTextAreasDescr(), m_bAllowCountByTokenAttributes, m_bAllowUnsafeQueries, m_bArchiveIndex, m_bCaseSensitive, m_bDisableDefaultQueryLexicalExpansion, m_bDwdsCorpusInterface, m_bEmptyLineIsSentenceDelim, m_bGutenbergInterface, m_BiblIndex, m_bIndexChunks, m_bIndexMorphPatterns, m_bIndexPunctuation, CStringIndexator::m_bMemoryMap, m_bNoContextOperator, m_bOutputBibliographyOfHits, m_bQueryOnlyFiles, m_bResumeOnIndexErrors, m_bShowNumberOfRelevantDocuments, m_bUseDwdsThesaurus, m_bUseIndention, m_bUseParagraphTagToDivide, CHighlightTags::m_bWasReadFromString, CBiblIndex::m_DefaultAttrName, m_HtmlHighlighting, m_InternetPathPrefix, m_InterpDelimiter, m_Language, m_LeftKwicContextSize, m_LocalPathPrefix, m_MaxCachedHitsCount, m_MaxQueryCacheSize, CStringIndexator::m_MaxRegExpExpansionSize, m_NearRank, m_NumberOfKwicLinesInSnippets, m_OpDefaultIndexNames, m_PositionRank, m_RightKwicContextSize, m_TableHighlighting, m_TextHighlighting, m_TfIdfRank, m_Txd, m_UserMaxInputLoadIndexSize, m_UserMaxTokenCountInOnePeriod, m_Utf8, morphUnknown, and CHighlightTags::ToString().
Referenced by CConcIndexator::CreateAsUnion(), HasEqualOptions(), SaveOptions(), and CConcIndexator::SplitProject().
void CConcordance::RegisterIndicesToShow | ( | const string & | IndexListStr | ) |
wrapper for IndicesToShow option; now accepts integer position (min=1), long name, short name, or alias
References Format(), and StringTokenizer::val().
Referenced by LoadOptionsFromString().
string CConcordance::GetIndicesToShowStr | ( | bool | Positional = false | ) | const |
get opt-style string declaration for IndicesToShow (empty if default)
References Format(), and CStringIndexSet::m_Name.
Referenced by LoadOptionsFromString(), and SaveOptionsToString().
|
inline |
return default pcre regex options
Referenced by CQueryTokenNode::BuildRegExp(), CQueryOptions::CheckSatisfiable(), and LoadOptionsFromString().
|
inline |
return true, if DDC outputs results in DWDS format
|
inline |
return true, if DDC outputs results in Gutenberg project format
|
inline |
return true, if query context operator (#Cntxt) is switched off
Referenced by CDDCLeafServer::handle__info().
|
inline |
return true, if DWDS thesaurus is enabled (index "Thes")
Referenced by CDDCLeafServer::handle__info(), and CConcIndexator::StartIndexing().
|
inline |
return true, if DDC should output bibliographical information for hits instead of corpus file names
|
inline |
wrapper for m_bIndexPunctuation
Referenced by CDDCLeafServer::handle__info().
string CConcordance::GetHtmlReference | ( | size_t | posFile | ) | const |
get an HTML formatted reference to a corpus file
References Format().
string CConcordance::GetShortFilename | ( | size_t | posFile | ) | const |
get a reference to a corpus file without the common left prefix
string CConcordance::GetFileNameForCorpusFileNames | ( | ) | const |
get file name for storing corpus file names
References MakeFName().
Referenced by CConcIndexator::CreateAsUnion(), CConcIndexator::DestroyIndex(), ConcIndexatorInvoker::FinalizeIndex(), CConcIndexator::SaveCorpusFileList(), and CConcIndexator::SplitProject().
string CConcordance::GetFileNameForMaskedFiles | ( | ) | const |
get file name for masked files (strings)
References MakeFName().
Referenced by CConcIndexator::DestroyIndex().
string CConcordance::GetFileNameForMaskedFileIds | ( | ) | const |
get file name for masked files (IDs, binary)
References MakeFName().
Referenced by CConcIndexator::CreateAsUnion(), CConcIndexator::DestroyIndex(), and CConcIndexator::SaveMaskedFileIds().
vector< string > CConcordance::GetTokenFields | ( | const COutputToken & | tok | ) |
parse a delimited token into fields by splitting on m_InterpDelimiter
References COutputToken::m_InterpStr, and COutputToken::m_TokenStr.
|
inline |
get the number of indexed corpus files
References ddcCorpusList< OffT_ >::size().
Referenced by CQCount::CountUniversal(), CDDCLeafServer::handle__info(), and CConcIndexator::SplitProject().
|
inline |
get corpus file by index
|
inline |
get count of masked files
References idfJson.
Referenced by CDDCLeafServer::handle__info().
void CConcordance::LoadSourceFilesAndOptions | ( | string | FileName, |
bool | reallyReadSourceFiles = true |
||
) |
loads the list of source files and parses the option file (*.opt)
References ddcLogWarn, Format(), LoadFileToString(), MakeFName(), and CSourceFileHolder::ReadSourceFileList().
Referenced by ConcIndexatorInvoker::BuildIndexStart().
void CConcordance::LoadCorpusFiles | ( | ) |
load list of corpus files (*.con)
References ddc_format_version_check(), ddcLogWarn, FileExists(), LoadFileToString(), and Trim().
string CConcordance::GetCommonFilePrefix | ( | ) | const |
(re-)compute common prefix of corpus files in m_CorpusFiles
Referenced by CConcIndexator::SaveCorpusFileList().
void CConcordance::LoadMaskedFiles | ( | ) |
load list of masked (deleted) corpus files
References ddcVecFile< T >::begin(), ddcLogDebug, ddcLogWarn, ddcVecFile< T >::end(), FileExists(), Format(), ddcCorpusList< OffT_ >::NO_ID, ddcVecFile< T >::open(), ddcVecFile< T >::size(), and Trim().
void CConcordance::LoadOptionsFromFile | ( | const string & | OptFile | ) |
loads options from a named file (calls LoadOptionsFromString())
References Format(), LoadFileToString(), and LoadOptionsFromString().
void CConcordance::LoadProject | ( | string | FileName, |
bool | includeSourceFiles = true |
||
) |
loads everything
References Format().
Referenced by ConcIndexatorInvoker::BuildOnlyMorphIndex().
time_t CConcordance::UpdateTimestamp | ( | const char * | filename | ) |
update m_Timestamp to the mtime of filename if it is newer than the current m_Timestamp
References FileMTime().
void CConcordance::DumpBibliography | ( | FILE * | f = stdout | ) | const |
dump loaded metadata to a stream (JSON)
string CConcordance::DumpFileBibliography | ( | DWORD | FileNo | ) | const |
dump file metadata to a string (JSON)
References Format(), jsonStr(), CBibliography::m_DateStr, CBibliography::m_OrigBibl, CBibliography::m_ScanBibl, CBibliography::m_StartPageInfo, and UnknownPageNumber.
void CConcordance::DumpFileIndexJson | ( | DWORD | FileNo, |
FILE * | f = stdout |
||
) | const |
dump file data to a file (JSON)
References ddcVecFile< T >::end(), jsonStr(), and CPageNumber::m_StartTokenNo.
void CConcordance::DumpIndex | ( | string | dirname, |
CIndexDumpFormat | fmt = idfJson |
||
) | const |
dump index data to a directory (format-dependent)
References errRuntime, Format(), idfJson, idfTabs, and StrError().
void CConcordance::DumpIndexToSingleTabFile | ( | FILE * | outfp | ) | const |
dump index data to a single tab formatted file
bool CConcordance::SaveOptions | ( | string | FileName | ) | const |
saves options to option file (*.opt)
References MakeFName(), and SaveOptionsToString().
Referenced by CConcIndexator::CreateAsUnion().
bool CConcordance::GetAllOccurrences | ( | vector< CTokenNo > & | occurrences, |
size_t | searchPeriodNo | ||
) | const |
moo: get a vector of all occurrences in a given search period; used by universal wildcard (*) queries
Referenced by CQueryTokenNode::EvaluateWithoutHits().
bool CConcordance::GetOccurrencesByPosition | ( | const string & | BreakName, |
int | anchor, | ||
vector< CTokenNo > & | occurrences, | ||
size_t | searchPeriodNo | ||
) | const |
moo: search named break collection for matching positions, used by anchor queries
References ddcVecFile< T >::begin(), ddcVecFile< T >::end(), errRuntime, and CHitBorders::CBreakCollection::m_BreakOffsets.
Referenced by CQueryTokenNode::EvaluateWithoutHits().
|
inline |
|
private |
a table of character properties for regular expressions which depend on CConcIndexator::m_Language
Referenced by LoadOptionsFromString().
|
private |
if m_bDwdsCorpusInterface is on, the program outputs results in DWDS format
Referenced by InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
|
private |
if m_bGutenbergInterface is on, the program outputs results in the format of the Gutenberg project
Referenced by InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
|
private |
should we switch off context operator (#Cntxt) due to copyright
Referenced by InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
|
private |
Should we show bibliography of the hits instead of filename.
Referenced by InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
|
private |
Referenced by LoadOptionsFromString(), and SaveOptionsToString().
|
private |
Referenced by LoadOptionsFromString(), and SaveOptionsToString().
|
private |
|
protected |
the type of index
Referenced by GetIndexTypeStr(), CConcIndexator::IndexOneFile(), InitDefaultOptions(), LoadOptionsFromString(), and ReadIndexTypeFromStr().
|
protected |
Enables indexing all punctuation marks.
Referenced by InitDefaultOptions(), CConcIndexator::IsDWDSToken(), LoadOptionsFromString(), and SaveOptionsToString().
|
protected |
Enables using "<p>" tag as a paragraph delimiter.
Referenced by InitDefaultOptions(), CConcIndexator::InitGraphan(), LoadOptionsFromString(), and SaveOptionsToString().
|
protected |
if m_bEmptyLineIsSentenceDelim is on, every empty line in the input file is considered to be the end of the sentence.
Referenced by InitDefaultOptions(), CConcIndexator::InitGraphan(), LoadOptionsFromString(), and SaveOptionsToString().
|
protected |
if m_bUseIndention is on, the program tries to find paragraphs using indentation
Referenced by InitDefaultOptions(), CConcIndexator::InitGraphan(), LoadOptionsFromString(), and SaveOptionsToString().
|
protected |
The maximal number of occurrences in one subcorpus (defined by user)
Referenced by CConcIndexator::GetMaxTokenCountInOnePeriod(), InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
|
protected |
The maximal number of occurrences in the input load index, by default 400000.
Referenced by CConcIndexator::GetMaxInputLoadIndexSize(), InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
|
protected |
Enables indexing and querying using DWDS Thesaurus.
Referenced by CConcIndexator::IndexTextOrHtmlFile(), InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
MorphLanguageEnum CConcordance::m_Language |
the language of the corpus
Referenced by CQueryTokenNode::CreateLemmaPattern(), CConcIndexator::CreateMorphIndexWrapper(), CQueryTokenNode::CreateThesPattern(), CDDCLeafServer::handle__info(), InitDefaultOptions(), CConcIndexator::InitGraphan(), LoadOptionsFromString(), and SaveOptionsToString().
bool CConcordance::m_bIndexMorphPatterns |
Enables the index of morph patterns.
Referenced by ConcIndexatorInvoker::FinalizeIndex(), CDDCLeafServer::handle__info(), InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
bool CConcordance::m_bIndexChunks |
Enables indexing and querying using chunks.
Referenced by CDDCLeafServer::handle__info(), CConcIndexator::IndexOneTableTextArea(), InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
bool CConcordance::m_bCaseSensitive |
if true, then the default search is case sensitive
Referenced by CDDCLeafServer::handle__info(), InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
bool CConcordance::m_bShowNumberOfRelevantDocuments |
if true, then DDC always calculates the number of documents in which at least one hit is found
Referenced by InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
bool CConcordance::m_bQueryOnlyFiles |
prohibits sentence break collection under DWDS_Index or MorphXML_Index
Referenced by CConcIndexator::IndexTextOrHtmlFile(), InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
bool CConcordance::m_bArchiveIndex |
specifies that the index should be archived under DWDS_Index or MorphXML_Index
Referenced by InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
bool CConcordance::m_bResumeOnIndexErrors |
if true, CConcIndexatorInvoker skips source documents with errors
Referenced by ConcIndexatorInvoker::FinalizeIndex(), ConcIndexatorInvoker::IndexFile(), CConcIndexator::IndexMorphXml(), CConcIndexator::IndexOneTableTextArea(), CConcIndexator::IndexTextOrHtmlFile(), InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
ddcCorpusList CConcordance::m_CorpusFiles |
CMaskedFileSet CConcordance::m_MaskedFiles |
CConcXml CConcordance::m_BiblIndex |
a member which holds an index for bibliographical information
Referenced by CQueryOptions::Compile(), CQFBiblSort::Compile(), CQFHasFieldValue::Compile(), CQFHasFieldRegex::Compile(), CQFHasFieldSet::Compile(), CQueryNode::ConvertOccurrencesToHits(), CQueryNode::ConvertOccurrencesToHitsForPatterns(), CQCount::CountUniversal(), CConcIndexator::CreateAsUnion(), CQFSort::GetBiblConstant(), CDDCLeafServer::handle__info(), CConcIndexator::IndexFreeIndex(), CConcIndexator::IndexMorphXml(), CConcIndexator::IndexTextOrHtmlFile(), InitDefaultOptions(), CConcIndexator::LoadFileIntoGraphan(), LoadOptionsFromString(), CConcIndexator::LoadXmlFile(), CConcIndexator::NormalEndIndexing(), CQFSort::ResolveAttributeName(), SaveOptionsToString(), CConcIndexator::SplitProject(), CConcIndexator::StartIndexing(), and CConcIndexator::TerminateIndexing().
CHighlightTags CConcordance::m_HtmlHighlighting |
highlighting tags for CConcHolder::m_ResultFormat == DDC_ResultHTML
Referenced by InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
CHighlightTags CConcordance::m_TextHighlighting |
highlighting delimiters for CConcHolder::m_ResultFormat == DDC_ResultText
Referenced by InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
CHighlightTags CConcordance::m_TableHighlighting |
highlighting delimiters for CConcHolder::m_ResultFormat == DDC_ResultTable
Referenced by InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
bool CConcordance::m_bDisableDefaultQueryLexicalExpansion |
if true, then no default lexical expansion of query words occurs
Referenced by InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
int CConcordance::m_LeftKwicContextSize |
the size of the left context of the highlighted words in document search
Referenced by InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
int CConcordance::m_RightKwicContextSize |
the size of the right context of the highlighted words in document search
Referenced by InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
int CConcordance::m_NumberOfKwicLinesInSnippets |
the maximal number of kwic lines in file snippets
Referenced by InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
double CConcordance::m_TfIdfRank |
the parameter for TfIdf ranking
Referenced by InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
double CConcordance::m_NearRank |
the parameter for Near ranking
Referenced by InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
double CConcordance::m_PositionRank |
the parameter for Position ranking
Referenced by InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
string CConcordance::m_InterpDelimiter |
delimiter to use between token index fields in output
Referenced by CDDCLeafServer::handle__info(), InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
string CConcordance::m_TokenDelimiter |
delimiter to use between tokens in output
Referenced by CDDCLeafServer::handle__info(), InitDefaultOptions(), and LoadOptionsFromString().
bool CConcordance::m_Utf8 |
whether to assume indexed data is utf8 encoded (default=no)
Referenced by CDDCLeafServer::handle__info(), InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
bool CConcordance::m_bAllowUnsafeQueries |
potentially unsafe queries will throw an exception unless this is true (default=false)
Referenced by CQTokFile::Create(), CDDCLeafServer::handle__info(), InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
bool CConcordance::m_bAllowCountByTokenAttributes |
using any token attribute as a count-key will throw an exception unless this is true (default=true)
Referenced by CQCountKeyExprToken::Compile(), CDDCLeafServer::handle__info(), InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
bool CConcordance::m_bLemmaQueryUsesMorphPattern |
interpret "%foo" queries using MorphPattern? (default=true)
Referenced by CQTokLemma::Create(), InitDefaultOptions(), and LoadOptionsFromString().
vector<size_t> CConcordance::m_IndicesToShow |
indices to show for Free_Index
Referenced by CDDCLeafServer::handle__info(), and LoadOptionsFromString().
time_t CConcordance::m_Timestamp |
moo: timestamp of project *._con file
Referenced by CDDCLeafServer::handle__info(), and InitDefaultOptions().
size_t CConcordance::m_MaxCachedHitsCount |
moo: maximum number of hits in a CConcHolder cache entry – query results with more than MaxCachedHitsCount hits will not be cached (default=512)
Referenced by CConcSessionContext::CacheGet(), CConcSessionContext::CacheSet(), InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
size_t CConcordance::m_MaxQueryCacheSize |
moo: maximum number of queries to be cached by an associated CConcHolder (default=512)
Referenced by CConcSessionContext::CacheSet(), InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
TxDispatcher CConcordance::m_Txd |
term expansion dispatcher; should define at least an entry for "default"
Referenced by CConcSession::GetTxDispatcher(), InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
map<string, string> CConcordance::m_OpDefaultIndexNames |
maps token-query operators to default index names; keys are as returned by CQToken::OperatorKey()
Referenced by CQToken::BreakName(), CDDCLeafServer::handle__info(), CQToken::IndexName(), InitDefaultOptions(), LoadOptionsFromString(), and SaveOptionsToString().
map<string, pair<bool, string> > CConcordance::m_ServerInfo |
maps symbolic keys to string constants to be included in the corpus 'info' response as info.user.KEY=VAL; values are pairs (isFile,Value) s.t. Value is a filename iff isFile is true, otherwise a literal value
Referenced by CDDCLeafServer::handle__info(), InitDefaultOptions(), and LoadOptionsFromString().