|
typedef map< string, string > | IndexAliasMap |
| typedef for index alias maps More...
|
|
typedef map< string, CStringIndexSet * > | IndexMap |
| typedef for index symbol table More...
|
|
| CConcordance () |
|
| ~CConcordance () |
|
void | RegisterIndicesToShow (const string &IndexListStr) |
| wrapper for IndicesToShow option; now accepts integer position (min=1), long name, short name, or alias More...
|
|
string | GetIndicesToShowStr (bool Positional=false) const |
| get opt-style string declaration for IndicesToShow (empty if default) More...
|
|
RML_RE::Options | GetRegexOptions () const |
| return default pcre regex options More...
|
|
bool | IsDwdsCorpusInterface () const |
| return true, if DDC outputs results in DWDS format More...
|
|
bool | IsGutenbergInterface () const |
| return true, if DDC outputs results in Gutenberg project format More...
|
|
bool | HasContextOperator () const |
| return true, if query context operator (#Cntxt) is switched off More...
|
|
bool | UseDwdsThesaurus () const |
| return true, if DWDS thesaurus is enabled (index "Thes") More...
|
|
bool | OutputBibliographyOfHits () const |
| return true, if DDC should output bibliographical information for hits instead of corpus file names More...
|
|
bool | IndexPunctuation () const |
| wrapper for m_bIndexPunctuation More...
|
|
string | GetHtmlReference (size_t posFile) const |
| get an HTML formatted reference to a corpus file More...
|
|
string | GetShortFilename (size_t posFile) const |
| get a reference to a corpus file without the common left prefix More...
|
|
string | GetFileNameForCorpusFileNames () const |
| get file name for storing corpus file names More...
|
|
string | GetFileNameForMaskedFiles () const |
| get file name for masked files (strings) More...
|
|
string | GetFileNameForMaskedFileIds () const |
| get file name for masked files (IDs, binary) More...
|
|
vector< string > | GetTokenFields (const COutputToken &tok) |
| parse a delimited token into fields by splitting on m_InterpDelimiter More...
|
|
size_t | GetCorpusFilesCount () const |
| get the number of indexed corpus files More...
|
|
string | GetCorpusFile (CFileNo FileNo) const |
| get corpus file by index More...
|
|
size_t | GetMaskedFilesCount () const |
| get count of masked files More...
|
|
void | LoadSourceFilesAndOptions (string FileName, bool reallyReadSourceFiles=true) |
| loads the list of source files and parses the option file (*.opt) More...
|
|
void | LoadCorpusFiles () |
| load list of corpus files (*.con) More...
|
|
string | GetCommonFilePrefix () const |
| (re-)compute common prefix of corpus files in m_CorpusFiles More...
|
|
void | LoadMaskedFiles () |
| load list of masked (deleted) corpus files (Corpus File Definition) More...
|
|
void | LoadOptionsFromFile (const string &OptFile) |
| loads options from a named file (calls LoadOptionsFromString()) More...
|
|
void | LoadProject (string FileName, bool includeSourceFiles=true) |
| loads everything More...
|
|
time_t | UpdateTimestamp (const char *filename) |
| update m_Timestamp to the mtime of filename if it is newer than the current m_Timestamp More...
|
|
void | DumpBibliography (FILE *f=stdout) const |
| dump loaded metadata to a stream (JSON) More...
|
|
string | DumpFileBibliography (DWORD FileNo) const |
| dump file metadata to a string (JSON) More...
|
|
void | DumpFileIndexJson (DWORD FileNo, FILE *f=stdout) const |
| dump file data to a file (JSON) More...
|
|
void | DumpIndex (string dirname, CIndexDumpFormat fmt=idfJson) const |
| dump index data to a directory (format-dependent) More...
|
|
void | DumpIndexToSingleTabFile (FILE *outfp) const |
| dump index data to a single tab formatted file More...
|
|
bool | SaveOptions (string FileName) const |
| saves options to option file (*.opt) More...
|
|
bool | GetAllOccurrences (vector< CTokenNo > &occurrences, size_t searchPeriodNo) const |
| moo: get a vector of all occurrences in a given search period; used by universal wildcard (*) queries More...
|
|
bool | GetOccurrencesByPosition (const string &BreakName, int anchor, vector< CTokenNo > &occurrences, size_t searchPeriodNo) const |
| moo: search named break collection for matching positions, used by anchor queries More...
|
|
bool | UseTabFormatForLoading () const |
|
| CStringIndexator () |
|
| ~CStringIndexator () |
|
bool | RegisterStringIndices (const string &IndicesStr) |
| read index declarations from a string and register them More...
|
|
bool | RegisterIndexAliases (const string &IndexAliasStr) |
| read index alias declarations from a string and register them; returns true iff all registrations were successful More...
|
|
bool | RegisterIndexAlias (const string &AliasFrom, const string &AliasTo) |
| register a single index alias (low-level); returns true iff AliasTo resolves to a known index according to m_AliasMap More...
|
|
void | RegisterIndexAlias (const string &AliasFrom, CStringIndexSet *idx) |
| register a single index label or alias (lowest-level); if idx is NULL, any existing entry for AliasFrom will be deleted More...
|
|
void | SetPath (string Path) |
| set the path to the indices More...
|
|
string | GetIndicesString () const |
| return all registered index declarations, in opt-file syntax More...
|
|
string | GetIndexAliasString () const |
| return all registered index aliases, in opt-file syntax More...
|
|
size_t | GetSearchPeriodsCount () const |
| return the number of corpus periods More...
|
|
const CTokenNo & | GetSearchPeriod (size_t i) const |
| get a corpus period by an index More...
|
|
bool | StartIndexing (string Path) |
| call CreateTempFiles for all registered indices More...
|
|
void | TerminateIndexing () |
| call DeleteTempFiles for all registered indices More...
|
|
bool | FinalSaveAllIndices (bool bAfterLoading) |
| final saving all indices to disk (converting temp files to persistent) More...
|
|
bool | AddInputLoadIndexToMemoryLoadIndex () |
| unites input index with memory index and clears input load index More...
|
|
bool | AddMemoryLoadIndexToMainLoadIndex () |
| unites memory index with main index and clears memory load index More...
|
|
bool | SaveMemoryLoadIndex () |
| store memory load index on the disk More...
|
|
CStringIndexSet * | GetIndexByName (const string &Name) |
| return a pointer to the index by CStringIndexSet::m_Name (linear search) More...
|
|
CStringIndexSet * | GetIndexByNameOrShortName (const string &Name) |
| return a pointer to the index by CStringIndexSet::m_Name or CStringIndexSet::m_ShortName (linear search) More...
|
|
CStringIndexSet * | GetIndexByAlias (const string &Alias) const |
| return a pointer to the index by long-name, short-name, or alias (most abstract, uses m_IndexMap) More...
|
|
CStringIndexSet * | GetTokenIndex () |
| return the first index that normally contains tokens themselves More...
|
|
const CStringIndexSet * | GetTokenIndex () const |
| return the first index that normally contains tokens themselves More...
|
|
const CBreakCollection * | GetBreakCollectionByName (const string &Name) const |
| moo: get break collection by long or short name More...
|
|
const vector< CBreakCollection > & | GetBreaks (void) const |
| moo: get break collection map (dangerous) More...
|
|
| CHitBorders () |
|
string | GetBorderIndicesString () const |
| return the string representation of break collection descriptions More...
|
|
string | WithinBreakName (const vector< string > &Within) const |
|
const ddcBreakVector * | GetBreaksByName (const string &ShortName) const |
| returns a break collection by a short name More...
|
|
CTokenNo | GetCorpusEndTokenNo () const |
| returns the value of the last file break (which should be equal to the last value of any break collection) More...
|
|
const ddcBreakVector & | GetFileBreaks () const |
| quick reference to file breaks More...
|
|
CTokenNo | GetFileStartTokenNo (size_t FileNo) const |
| returns the start position of corpus file FileNo More...
|
|
DWORD | GetPageNumber (size_t No) const |
| returns m_PageBreaks[No].m_PageNumber (see CPageNumber) More...
|
|
bool | IsRegisteredBreak (const string &ShortName) const |
| returns true if a short name is found in m_Breaks More...
|
|
void | RegisterBorderIndices (const char *IndicesStr) |
| creates empty elements of m_Breaks from their string descriptions More...
|
|
bool | LoadHitBorders (string Path, bool useMMap=false) |
| load break collections from the disk More...
|
|
void | ConvertHitsToPageBreaks (vector< CHit >::const_iterator hits_begin, vector< CHit >::const_iterator hits_end, const ddcBreakVector &Breaks, DwordVector &PageBreaks) const |
| converts hits to the page breaks which contain these breaks More...
|
|
ddcVecFile< CPageNumber >::const_iterator | GetTokenPageBreak (CTokenNo tok) const |
| get page break for a given token number as an iterator into m_PageBreaks More...
|
|
void | AddBreakByName (const string &ShortName, const CTokenNo &B) |
| adds one break to a collection identified by a short name (during indexing) More...
|
|
void | BordersEndIndexing (string Path) |
| closes all CBreakCollectionDescr::m_FileForIndexing from m_Breaks (during indexing) More...
|
|
void | StartTextAreaBorders () |
| must be called before indexing each text area in order to create at least one break in each text area More...
|
|
void | EndTextAreaBorders (DWORD TextAreaEndTokenNo) |
| must be called after indexing each text area in order to create at least one break in each text area More...
|
|
| CSourceFileHolder () |
|
bool | SaveSourceFileList (string FileName) |
| Saves the list of source files to file *.con. More...
|
|
void | DeleteSourceFile (long ItemNo) |
| Deletes a source file. More...
|
|
void | AddSourceFile (const char *FileName) |
| Adds a source file. More...
|
|
void | DeleteAllSourceFiles () |
| deletes all source files (Source File Definition) More...
|
|
size_t | GetSourceFilesCount () const |
|
string | GetSourceFile (size_t FileNo) const |
| get the source file by the index More...
|
|
void | AddSourceFilesFrom (const CSourceFileHolder &X) |
|
void | ReadSourceFileList (string FileName) |
|
int | FoundNotExistedFile () const |
| finds a source file which does not exist; if there is no such file, returns -1 More...
|
|
bool | IsModified () const |
|
MorphLanguageEnum | m_Language |
| the language of the corpus More...
|
|
bool | m_bIndexMorphPatterns |
| Enables the index of morph patterns. More...
|
|
bool | m_bIndexChunks |
| Enables indexing and querying using chunks. More...
|
|
bool | m_bCaseSensitive |
| if true, then the default search is case sensitive More...
|
|
bool | m_bShowNumberOfRelevantDocuments |
| if true, then DDC always calculates the number of documents where at least one hit is found More...
|
|
bool | m_bQueryOnlyFiles |
| prohibits sentence break collection under DWDS_Index or MorphXML_Index More...
|
|
bool | m_bArchiveIndex |
| specifies that the index should be archived under DWDS_Index or MorphXML_Index More...
|
|
bool | m_bResumeOnIndexErrors |
| if true, CConcIndexatorInvoker skips source documents with errors More...
|
|
ddcCorpusList | m_CorpusFiles |
| Corpus files More...
|
|
CMaskedFileSet | m_MaskedFiles |
|
CConcXml | m_BiblIndex |
| a member which holds an index for bibliographical information More...
|
|
CHighlightTags | m_HtmlHighlighting |
| highlighting tags for CConcHolder::m_ResultFormat == DDC_ResultHTML More...
|
|
CHighlightTags | m_TextHighlighting |
| highlighting delimiters for CConcHolder::m_ResultFormat == DDC_ResultText More...
|
|
CHighlightTags | m_TableHighlighting |
| highlighting delimiters for CConcHolder::m_ResultFormat == DDC_ResultTable More...
|
|
bool | m_bDisableDefaultQueryLexicalExpansion |
| if true, then no default lexical expansion of query words occurs More...
|
|
int | m_LeftKwicContextSize |
| the size of the left context of the highlighted words in document search More...
|
|
int | m_RightKwicContextSize |
| the size of the right context of the highlighted words in document search More...
|
|
int | m_NumberOfKwicLinesInSnippets |
| the maximal number of kwic lines in file snippets More...
|
|
double | m_TfIdfRank |
| the parameter for TfIdf ranking More...
|
|
double | m_NearRank |
| the parameter for Near ranking More...
|
|
double | m_PositionRank |
| the parameter for Position ranking More...
|
|
string | m_InterpDelimiter |
| delimiter to use between token index fields in output More...
|
|
string | m_TokenDelimiter |
| delimiter to use between tokens in output More...
|
|
bool | m_Utf8 |
| whether to assume indexed data is utf8 encoded (default=no) More...
|
|
bool | m_bAllowUnsafeQueries |
| potentially unsafe queries will throw an exception unless this is true (default=false) More...
|
|
bool | m_bAllowCountByTokenAttributes |
| using any token attribute as a count-key will throw an exception unless this is true (default=true) More...
|
|
bool | m_bLemmaQueryUsesMorphPattern |
| interpret "%foo" queries using MorphPattern? (default=true) More...
|
|
vector< size_t > | m_IndicesToShow |
| indices to show for Free_Index More...
|
|
time_t | m_Timestamp |
| moo: timestamp of project *._con file More...
|
|
size_t | m_MaxCachedHitsCount |
| moo: maximum number of hits in a CConcHolder cache entry – query results with more than MaxCachedHitsCount hits will not be cached (default=512) More...
|
|
size_t | m_MaxQueryCacheSize |
| moo: maximum number of queries to be cached by an associated CConcHolder (default=512) More...
|
|
TxDispatcher | m_Txd |
| term expansion dispatcher; should define at least an entry for "default" More...
|
|
map< string, string > | m_OpDefaultIndexNames |
| maps token-query operators to default index names; keys are as returned by CQToken::OperatorKey() More...
|
|
map< string, pair< bool, string > > | m_ServerInfo |
| maps symbolic keys to string constants to be included in the corpus 'info' response as info.user.KEY=VAL; values are pairs (isFile,Value) s.t. Value is a filename iff isFile is a true value, otherwise a literal value More...
|
|
string | m_Path |
| where all indices are stored More...
|
|
bool | m_bMemoryMap |
| whether to directly mmap() index file data (default=false) More...
|
|
vector< CStringIndexSet * > | m_Indices |
| the registered indices, by positional index More...
|
|
IndexAliasMap | m_IndexAlias |
| declared index aliases (FROM -> TO); not really used at runtime More...
|
|
IndexMap | m_IndexMap |
| all registered indices, keyed by long-name, short-name, or label (LABEL -> INDEX) More...
|
|
size_t | m_MaxRegExpExpansionSize |
| the maximal number of index items which can be included in an expansion set of one regular expression More...
|
|
CStringIndexSet * | m_pChunkIndex |
| a quick reference to a chunk index, if CConcIndexator::m_bIndexChunks is on, otherwise null More...
|
|
enum | DDCIndexTypeEnum { DWDS_Index,
MorphXML_Index,
Free_Index,
TabFormat_Index
} |
| enum DDCIndexTypeEnum contains index types. Each index type determines DDC indices and break collections. More...
|
|
DDCIndexTypeEnum | m_IndexType |
| the type of index More...
|
|
bool | m_bIndexPunctuation |
| Enables indexing all punctuation marks. More...
|
|
bool | m_bUseParagraphTagToDivide |
| Enables using "<p>" tag as a paragraph delimiter. More...
|
|
bool | m_bEmptyLineIsSentenceDelim |
| if m_bEmptyLineIsSentenceDelim is on, every empty line in the input file is considered to be the end of the sentence. More...
|
|
bool | m_bUseIndention |
| if m_bUseIndention is on, the program tries to find paragraphs using indentation More...
|
|
DWORD | m_UserMaxTokenCountInOnePeriod |
| The maximal number of occurrences in one subcorpus (defined by user) More...
|
|
DWORD | m_UserMaxInputLoadIndexSize |
| The maximal number of occurrences in the input load index, by default 400000. More...
|
|
bool | m_bUseDwdsThesaurus |
| Enables indexing and querying using DWDS Thesaurus. More...
|
|
vector< CTokenNo > | m_SearchPeriods |
| search periods of the corpus More...
|
|
vector< CBreakCollection > | m_Breaks |
| all breaks More...
|
|
map< string, int > | m_ShortName2BreakCollection |
| the map from CBreakCollection.m_ShortName to the index in m_Breaks More...
|
|
map< string, int > | m_LongName2BreakCollection |
| the map from CBreakCollection.m_LongName to the index in m_Breaks More...
|
|
int | m_FileBreakCollectionNo |
| a quick reference to file breaks (which are also stored in m_Breaks) More...
|
|
string | m_DefaultBreakName |
| The name of the default break collection (written in the options file) More...
|
|
ddcVecFile< CPageNumber > | m_PageBreaks |
| page number collection More...
|
|
vector< DWORD > | m_LastTextAreaBreaks |
|
vector< string > | m_SourceFiles |
| Source files More...
|
|
bool | m_bModifiedListOfFiles |
|