ddc
|
#include <ConcIndexator.h>
Public Member Functions | |
CConcIndexator () | |
~CConcIndexator () | |
void | InitGraphan () |
initializes graphematics using current options More... | |
bool | SaveCorpusFileList () const |
saves corpus file list (*._con) More... | |
bool | SaveMaskedFileIds () const |
saves masked file-ids (*._masked_ids) More... | |
void | StartIndexing () |
begins indexing More... | |
void | DestroyIndex () |
destroy all index files More... | |
void | NormalEndIndexing () |
finishes indexing (normal way) More... | |
void | TerminateIndexing () |
terminates indexing (for exceptions) More... | |
void | IndexOneFile (CIndexDocument *document) |
index one file according to m_IndexType More... | |
void | RollbackIndexOneFile (CTokenNo startTrimTokenNo) |
rollback data buffered by an immediate preceding failed IndexOneFile() More... | |
void | CalculateSearchPeriods (DWORD MaxTokenCountInOnePeriod) |
finds all subcorpora More... | |
bool | CreateAsUnion (vector< CConcIndexator * > &X, bool inheritOptions=true) |
creates new concordance as union of one or more concordances (new) More... | |
bool | CreateAsUnion (const vector< string > &Xfiles, bool inheritOptions=true) |
creates new concordance as union of one or more concordances (new, filename-based) More... | |
bool | SplitProject (vector< CConcIndexator *> &Subs) const |
split project uniformly into sub-projects (new; sub-projects inherit parent options) More... | |
bool | SplitProject (vector< CConcIndexator *> &Subs, const string &SubOptions) const |
split project uniformly into sub-projects (guts, given sub-options as string) More... | |
bool | CreateMorphIndexWrapper () |
creates morphology index More... | |
DWORD | GetMaxTokenCountInOnePeriod () const |
returns the size of one subcorpus More... | |
DWORD | GetMaxInputLoadIndexSize () const |
returns the max size of the input index in tokens; must be less than GetMaxTokenCountInOnePeriod() More... | |
![]() | |
CConcordance () | |
~CConcordance () | |
void | RegisterIndicesToShow (const string &IndexListStr) |
wrapper for IndicesToShow option; now accepts integer position (min=1), long name, short name, or alias More... | |
string | GetIndicesToShowStr (bool Positional=false) const |
get opt-style string declaration for IndicesToShow (empty if default) More... | |
RML_RE::Options | GetRegexOptions () const |
return default pcre regex options More... | |
bool | IsDwdsCorpusInterface () const |
return true, if DDC outputs results in DWDS format More... | |
bool | IsGutenbergInterface () const |
return true, if DDC outputs results in Gutenberg project format More... | |
bool | HasContextOperator () const |
return true, if query context operator (#Cntxt) is switched off More... | |
bool | UseDwdsThesaurus () const |
return true, if DWDS thesaurus is enabled (index "Thes") More... | |
bool | OutputBibliographyOfHits () const |
return true, if DDC should output bibliographical information for hits instead of corpus file names More... | |
bool | IndexPunctuation () const |
wrapper for m_bIndexPunctuation More... | |
string | GetHtmlReference (size_t posFile) const |
get an HTML formatted reference to a corpus file More... | |
string | GetShortFilename (size_t posFile) const |
get a reference to a corpus file without the common left prefix More... | |
string | GetFileNameForCorpusFileNames () const |
get file name for storing corpus file names More... | |
string | GetFileNameForMaskedFiles () const |
get file name for masked files (strings) More... | |
string | GetFileNameForMaskedFileIds () const |
get file name for masked files (IDs, binary) More... | |
vector< string > | GetTokenFields (const COutputToken &tok) |
parse a delimited token into fields by splitting on m_InterpDelimiter More... | |
size_t | GetCorpusFilesCount () const |
get the number of indexed corpus files More... | |
string | GetCorpusFile (CFileNo FileNo) const |
get corpus file by index More... | |
size_t | GetMaskedFilesCount () const |
get count of masked files More... | |
void | LoadSourceFilesAndOptions (string FileName, bool reallyReadSourceFiles=true) |
load list of source files and parses option file (*.opt) More... | |
void | LoadCorpusFiles () |
load list of corpus files (*.con) More... | |
string | GetCommonFilePrefix () const |
(re-)compute common prefix of corpus files in m_CorpusFiles More... | |
void | LoadMaskedFiles () |
load list of masked (deleted) Corpus File Definition More... | |
void | LoadOptionsFromFile (const string &OptFile) |
loads options from a named file (calls LoadOptionsFromString()) More... | |
void | LoadProject (string FileName, bool includeSourceFiles=true) |
loads everything More... | |
time_t | UpdateTimestamp (const char *filename) |
update m_Timestamp to mtime of Filename if it is newer than the current m_Timestamp; More... | |
void | DumpBibliography (FILE *f=stdout) const |
dump loaded metadata to a stream (JSON) More... | |
string | DumpFileBibliography (DWORD FileNo) const |
dump file metadata to a string (JSON) More... | |
void | DumpFileIndexJson (DWORD FileNo, FILE *f=stdout) const |
dump file data to a file (JSON) More... | |
void | DumpIndex (string dirname, CIndexDumpFormat fmt=idfJson) const |
dump index data to a directory (format-dependent) More... | |
void | DumpIndexToSingleTabFile (FILE *outfp) const |
dump index data to a single tab formatted file More... | |
bool | SaveOptions (string FileName) const |
saves options to option file (*.opt) More... | |
bool | GetAllOccurrences (vector< CTokenNo > &occurrences, size_t searchPeriodNo) const |
moo: get a vector of all occurrences in a given search period; used by universal wildcard (*) queries More... | |
bool | GetOccurrencesByPosition (const string &BreakName, int anchor, vector< CTokenNo > &occurrences, size_t searchPeriodNo) const |
moo: search named break collection for matching positions, used by anchor queries More... | |
bool | UseTabFormatForLoading () const |
![]() | |
CStringIndexator () | |
~CStringIndexator () | |
bool | RegisterStringIndices (const string &IndicesStr) |
read index declarations from a string and register them More... | |
bool | RegisterIndexAliases (const string &IndexAliasStr) |
read index alias declarations from a string and register them; returns true iff all registrations were successful More... | |
bool | RegisterIndexAlias (const string &AliasFrom, const string &AliasTo) |
register a single index alias (low-level); returns true iff AliasTo resolves to a known index according to m_AliasMap More... | |
void | RegisterIndexAlias (const string &AliasFrom, CStringIndexSet *idx) |
register a single index label or alias (lowest-level); if idx is NULL, any existing entry for AliasFrom will be deleted More... | |
void | SetPath (string Path) |
set the path to the indices More... | |
string | GetIndicesString () const |
return all registered index declarations, in opt-file syntax More... | |
string | GetIndexAliasString () const |
return all registered index aliases, in opt-file syntax More... | |
size_t | GetSearchPeriodsCount () const |
return the number of corpus periods More... | |
const CTokenNo & | GetSearchPeriod (size_t i) const |
get a corpus period by an index More... | |
bool | StartIndexing (string Path) |
call CreateTempFiles for all registered indices More... | |
void | TerminateIndexing () |
call DeleteTempFiles for all registered indices More... | |
bool | FinalSaveAllIndices (bool bAfterLoading) |
final saving all indices to disk (converting temp files to persistent) More... | |
bool | AddInputLoadIndexToMemoryLoadIndex () |
unites input index with memory index and clears input load index More... | |
bool | AddMemoryLoadIndexToMainLoadIndex () |
unites memory index with main index and clears memory load index More... | |
bool | SaveMemoryLoadIndex () |
store memory load index on the disk More... | |
CStringIndexSet * | GetIndexByName (const string &Name) |
return a pointer to the index by CStringIndexSet::m_Name (linear search) More... | |
CStringIndexSet * | GetIndexByNameOrShortName (const string &Name) |
return a pointer to the index by CStringIndexSet::m_Name or CStringIndexSet::m_ShortName (linear search) More... | |
CStringIndexSet * | GetIndexByAlias (const string &Alias) const |
return a pointer to the index by long-name, short-name, or alias (most abstract, uses m_IndexMap) More... | |
CStringIndexSet * | GetTokenIndex () |
return the first index that normally contains tokens themselves More... | |
const CStringIndexSet * | GetTokenIndex () const |
return the first index that normally contains tokens themselves More... | |
![]() | |
const CBreakCollection * | GetBreakCollectionByName (const string &Name) const |
moo: get break collection by long or short name More... | |
const vector< CBreakCollection > & | GetBreaks (void) const |
moo: get break collection map (dangerous) More... | |
CHitBorders () | |
string | GetBorderIndicesString () const |
return the string representation of break collection descriptions More... | |
string | WithinBreakName (const vector< string > &Within) const |
const ddcBreakVector * | GetBreaksByName (const string &ShortName) const |
returns a break collection by a short name More... | |
CTokenNo | GetCorpusEndTokenNo () const |
returns the value of the last file break (which should be equal to the last value of any break collection) More... | |
const ddcBreakVector & | GetFileBreaks () const |
quick reference to file breaks More... | |
CTokenNo | GetFileStartTokenNo (size_t FileNo) const |
returns the start position of corpus file FileNo More... | |
DWORD | GetPageNumber (size_t No) const |
returns m_PageBreaks[No].m_PageNumber (see CPageNumber) More... | |
bool | IsRegisteredBreak (const string &ShortName) const |
returns true if a short name is found in m_Breaks More... | |
void | RegisterBorderIndices (const char *IndicesStr) |
creates empty elements of m_Breaks by its string descriptions More... | |
bool | LoadHitBorders (string Path, bool useMMap=false) |
load break collections from the disk More... | |
void | ConvertHitsToPageBreaks (vector< CHit >::const_iterator hits_begin, vector< CHit >::const_iterator hits_end, const ddcBreakVector &Breaks, DwordVector &PageBreaks) const |
converts hits to page breaks, which contain these breaks More... | |
ddcVecFile< CPageNumber >::const_iterator | GetTokenPageBreak (CTokenNo tok) const |
get page break for a given token number as an iterator into m_PageBreaks More... | |
void | AddBreakByName (const string &ShortName, const CTokenNo &B) |
adds one break to a collection identified by a short name (during indexing) More... | |
void | BordersEndIndexing (string Path) |
closes all CBreakCollectionDescr::m_FileForIndexing from m_Breaks (during indexing) More... | |
void | StartTextAreaBorders () |
must be called before indexing each text area in order to create at least one break in each text area More... | |
void | EndTextAreaBorders (DWORD TextAreaEndTokenNo) |
must be called after indexing each text area in order to create at least one break in each text area More... | |
![]() | |
CSourceFileHolder () | |
bool | SaveSourceFileList (string FileName) |
Saves the list of source files to file *.con. More... | |
void | DeleteSourceFile (long ItemNo) |
Deletes a source file. More... | |
void | AddSourceFile (const char *FileName) |
Adds a source file. More... | |
void | DeleteAllSourceFiles () |
deletes all Source File Definition source files More... | |
size_t | GetSourceFilesCount () const |
string | GetSourceFile (size_t FileNo) const |
get the source file by the index More... | |
void | AddSourceFilesFrom (const CSourceFileHolder &X) |
void | ReadSourceFileList (string FileName) |
int | FoundNotExistedFile () const |
finds a source file which does not exist, if there is no such file, returns -1 More... | |
bool | IsModified () const |
Private Member Functions | |
void | IndexTextOrHtmlFile (CIndexDocument *document) |
void | IndexMorphXml (CIndexDocument *document) |
void | IndexFreeIndex (CIndexDocument *document) |
void | IndexOneTableTextArea (const string &Text, const CPageNumber &StartPageFromHeader, size_t &page_breaks_count, CIndexDocument *document) |
bool | IsDWDSToken (long GraLine) const |
graphematical definition of a token for DWDSIndex More... | |
void | LoadXmlFile (string FileName, const char *pFileBuffer, CBibliography &Bibl) |
void | LoadFileIntoGraphan (string FileName, const char *pFileBuffer, CBibliography &Bibl) |
Private Attributes | |
CGraphmatFile | Graphmat |
graphmat (tokenization parser) More... | |
const CDwdsThesaurus * | m_pDwdsThesaurus |
a reference to DWDS thesaurus if applicable More... | |
Additional Inherited Members | |
![]() | |
typedef map< string, string > | IndexAliasMap |
typedef for index alias maps More... | |
typedef map< string, CStringIndexSet * > | IndexMap |
typedef for index symbol table More... | |
![]() | |
MorphLanguageEnum | m_Language |
the language of the corpus More... | |
bool | m_bIndexMorphPatterns |
Enables the index of morph patterns. More... | |
bool | m_bIndexChunks |
Enables indexing and querying using chunks. More... | |
bool | m_bCaseSensitive |
if true, then the default search is case sensitive More... | |
bool | m_bShowNumberOfRelevantDocuments |
if true, then DDC always calculates the number of documents, where at least one hit is found More... | |
bool | m_bQueryOnlyFiles |
prohibits sentence break collection under DWDS_Index or MorphXML_Index More... | |
bool | m_bArchiveIndex |
sets that index should be archived under DWDS_Index or MorphXML_Index More... | |
bool | m_bResumeOnIndexErrors |
if true, CConcIndexatorInvoker skips source documents with errors More... | |
ddcCorpusList | m_CorpusFiles |
Corpus files More... | |
CMaskedFileSet | m_MaskedFiles |
CConcXml | m_BiblIndex |
a member which holds an index for bibliographical information More... | |
CHighlightTags | m_HtmlHighlighting |
highlighting tags for CConcHolder::m_ResultFormat == DDC_ResultHTML More... | |
CHighlightTags | m_TextHighlighting |
highlighting delimiters for CConcHolder::m_ResultFormat == DDC_ResultText More... | |
CHighlightTags | m_TableHighlighting |
highlighting delimiters for CConcHolder::m_ResultFormat == DDC_ResultTable More... | |
bool | m_bDisableDefaultQueryLexicalExpansion |
if true, then no default lexical expansion of query words occurs More... | |
int | m_LeftKwicContextSize |
the size of the left context of the highlighted words in document search More... | |
int | m_RightKwicContextSize |
the size of the right context of the highlighted words in document search More... | |
int | m_NumberOfKwicLinesInSnippets |
the maximal number of kwic lines in file snippets More... | |
double | m_TfIdfRank |
the parameter for TfIdf ranking More... | |
double | m_NearRank |
the parameter for Near ranking More... | |
double | m_PositionRank |
the parameter for Position ranking More... | |
string | m_InterpDelimiter |
delimiter to use between token index fields in output More... | |
string | m_TokenDelimiter |
delimiter to use between tokens in output More... | |
bool | m_Utf8 |
whether to assume indexed data is utf8 encoded (default=no) More... | |
bool | m_bAllowUnsafeQueries |
potentially unsafe queries will throw an exception unless this is true (default=false) More... | |
bool | m_bAllowCountByTokenAttributes |
using any token attribute as a count-key will throw an exception unless this is true (default=true) More... | |
bool | m_bLemmaQueryUsesMorphPattern |
interpret "%foo" queries using MorphPattern? (default=true) More... | |
vector< size_t > | m_IndicesToShow |
indices to show for Free_Index More... | |
time_t | m_Timestamp |
moo: timestamp of project *._con file More... | |
size_t | m_MaxCachedHitsCount |
moo: maximum number of hits in a CConcHolder cache entry – query results with more than MaxCachedHitsCount hits will not be cached (default=512) More... | |
size_t | m_MaxQueryCacheSize |
moo: maximum number of queries to be cached by an associated CConcHolder (default=512) More... | |
TxDispatcher | m_Txd |
term expansion dispatcher; should define at least an entry for "default" More... | |
map< string, string > | m_OpDefaultIndexNames |
maps token-query operators to default index names; keys are as returned by CQToken::OperatorKey() More... | |
map< string, pair< bool, string > > | m_ServerInfo |
maps symbolic keys to string constants to be included in the corpus 'info' response as info.user.KEY=VAL; values are pairs (isFile,Value) s.t. Value is a filename iff isFile is a true value, otherwise a literal value More... | |
![]() | |
string | m_Path |
where all indices are stored More... | |
bool | m_bMemoryMap |
whether to directly mmap() index file data (default=false) More... | |
vector< CStringIndexSet * > | m_Indices |
the registered indices, by positional index More... | |
IndexAliasMap | m_IndexAlias |
declared index aliases (FROM -> TO); not really used at runtime More... | |
IndexMap | m_IndexMap |
all registered indices, keyed by long-name, short-name, or label (LABEL -> INDEX) More... | |
size_t | m_MaxRegExpExpansionSize |
the maximal number of index items which can be included in an expansion set of one regular expression More... | |
CStringIndexSet * | m_pChunkIndex |
a quick reference to a chunk index, if CConcIndexator::m_bIndexChunks is on, otherwise null More... | |
![]() | |
enum | DDCIndexTypeEnum { DWDS_Index, MorphXML_Index, Free_Index, TabFormat_Index } |
enum DDCIndexTypeEnum contains index types. Each index type determines DDC indices and break collections. More... | |
![]() | |
void | IndexTabFormat (CIndexDocument *document) |
![]() | |
void | AssertHasPath () const |
void | LoadOptionsFromString (string Options) |
loads options from a string More... | |
void | InitDefaultOptions () |
string | SaveOptionsToString () const |
saves options to a string More... | |
![]() | |
bool | RegisterChunkIndex () |
register chunk index (chunks:NP, VP etc) More... | |
string | GetSearchPeriodsFileName () const |
return the file name for search periods More... | |
bool | DestroyIndices () |
call DestroyIndexSet for all registered indices More... | |
void | ReadIndicesFromTheDisk () |
call ReadFromTheDisk for all registered indices More... | |
void | ClearStringIndices () |
clear m_Indices More... | |
void | IndexOneToken (CTokenIndexator *document, const char *Line, bool tryFixErrors=true) |
index one token and its properties (delimited by CConcCommon.h::globalFieldDelimeter) More... | |
void | IndexTokenFixLongColumns (const size_t MaxLen, const size_t nCols, const char *InputLine, char *Out) |
moo: truncate long columns in InputLine, storing result in Out More... | |
![]() | |
string | GetPageBreaksFileName (string Path) const |
returns the file name for page breaks More... | |
string | GetShortNameByName (const string &BreakName) const |
returns the short name of a break collection by the long or the short name More... | |
bool | StartIndexing (string Path) |
opens for writing all CBreakCollectionDescr::m_FileForIndexing from m_Breaks More... | |
bool | RemoveHitBordersFileAndClear (string Path) |
deletes all break files More... | |
void | AddPageBreak (const CPageNumber &P) |
adds one page break More... | |
void | SavePageBreaks (const string &ProjectPath) |
save page break file More... | |
int | RegisterBreak (string ShortName, string LongName) |
int | EnsureRegisteredBreak (string ShortName, string LongName) |
int | GetBreakCollectionIndexByName (string ShortName) const |
void | AddBreakByIndex (DWORD BreakCollectionNo, const CTokenNo &B) |
![]() | |
DDCIndexTypeEnum | m_IndexType |
the type of index More... | |
bool | m_bIndexPunctuation |
Enables indexing all punctuation marks. More... | |
bool | m_bUseParagraphTagToDivide |
Enables using "<p>" tag as a paragraph delimiter. More... | |
bool | m_bEmptyLineIsSentenceDelim |
if m_bEmptyLineIsSentenceDelim is on, every empty line in the input file is considered to be the end of the sentence. More... | |
bool | m_bUseIndention |
if m_bUseIndention is on, the program tries to find paragraphs using indentation More... | |
DWORD | m_UserMaxTokenCountInOnePeriod |
The maximal number of occurrences in one subcorpus (defined by user) More... | |
DWORD | m_UserMaxInputLoadIndexSize |
The maximal number of occurrences in the input load index, by default 400000. More... | |
bool | m_bUseDwdsThesaurus |
Enables indexing and querying using DWDS Thesaurus. More... | |
![]() | |
vector< CTokenNo > | m_SearchPeriods |
search periods of the corpus More... | |
![]() | |
vector< CBreakCollection > | m_Breaks |
all breaks More... | |
map< string, int > | m_ShortName2BreakCollection |
the map from CBreakCollection.m_ShortName to the index in m_Breaks More... | |
map< string, int > | m_LongName2BreakCollection |
the map from CBreakCollection.m_LongName to the index in m_Breaks More... | |
int | m_FileBreakCollectionNo |
a quick reference to file breaks (which are also stored in m_Breaks) More... | |
string | m_DefaultBreakName |
The name of the default break collection (written in the options file) More... | |
ddcVecFile< CPageNumber > | m_PageBreaks |
page number collection More... | |
vector< DWORD > | m_LastTextAreaBreaks |
![]() | |
vector< string > | m_SourceFiles |
Source files More... | |
bool | m_bModifiedListOfFiles |
CConcIndexator is the class for corpus indexing. Most of its slots come from the Concordance class.
CConcIndexator::CConcIndexator | ( | ) |
References CConcordance::InitDefaultOptions(), and m_pDwdsThesaurus.
Referenced by CreateAsUnion().
CConcIndexator::~CConcIndexator | ( | ) |
|
private |
References CHitBorders::AddBreakByIndex(), CHitBorders::AddBreakByName(), CConcXml::AddIndexItem(), CHitBorders::AddPageBreak(), CTokenIndexator::CorpusEndTokenNo, CIndexDocument::CorpusFileName, CIndexDocument::DocumentBuffer, Format(), CUnitHolder::FreeTable(), CDwdsThesaurus::GetAllThesInterpetations(), CHitBorders::GetBreakCollectionIndexByName(), CUnitHolder::GetPageNumber(), CBiblIndex::GetTextAreasCount(), CUnitHolder::GetTokensCount(), CUnitHolder::GetUnits(), CUnitHolder::GetUppercaseToken(), globalFieldDelimeter, Graphmat, CUnitHolder::HasDescr(), CStringIndexator::IndexOneToken(), IsDWDSToken(), IsSentenceEnd(), LoadFileIntoGraphan(), CConcordance::m_BiblIndex, CConcordance::m_bQueryOnlyFiles, CConcordance::m_bResumeOnIndexErrors, CConcordance::m_bUseDwdsThesaurus, CPageNumber::m_PageNumber, m_pDwdsThesaurus, CPageNumber::m_StartTokenNo, OUp, OUpLw, and PredefinedTextAreaBreakName.
Referenced by IndexOneFile().
|
private |
References CHitBorders::AddBreakByName(), CConcXml::AddIndexItem(), CHitBorders::AddPageBreak(), CTokenIndexator::CorpusEndTokenNo, CIndexDocument::CorpusFileName, CIndexDocument::DocumentBuffer, CXmlMorphAnnot::GetAsSetOfProperties(), GetIndexItemSetByVectorString(), globalFieldDelimeter, CStringIndexator::IndexOneToken(), CXmlToken::m_Annots, CConcordance::m_BiblIndex, CXmlToken::m_bLastInSentence, CConcordance::m_bResumeOnIndexErrors, CXmlMorphAnnot::m_Lemma, CPageNumber::m_PageNumber, CPageNumber::m_StartTokenNo, CXmlToken::m_Type, CXmlToken::m_WordStr, MorphAnnotationsDelim, CConcXml::ReadMorphXmlFileIntoGraTable(), UnknownPageNumber, and CBibliography::WriteToString().
Referenced by IndexOneFile().
|
private |
References CConcXml::AddIndexItem(), CHitBorders::AddPageBreak(), CTokenIndexator::CorpusEndTokenNo, CIndexDocument::CorpusFileName, CIndexDocument::DocumentBuffer, GetCWBFormattedStringRecursive(), CBiblIndex::GetTextAreaElements(), IndexOneTableTextArea(), CConcXml::LoadXmlAndReadBibliography(), CConcordance::m_BiblIndex, CPageNumber::m_PageNumber, CBibliography::m_StartPageInfo, CPageNumber::m_StartTokenNo, and UnknownPageNumber.
Referenced by IndexOneFile().
|
private |
References CHitBorders::AddBreakByIndex(), CHitBorders::AddPageBreak(), CTokenIndexator::CorpusEndTokenNo, CHitBorders::EndTextAreaBorders(), Format(), CHitBorders::GetBreakCollectionIndexByName(), CStringIndexator::IndexOneToken(), CIndexSetForLoadingStage::InsertToInputLoadIndex(), CHitBorders::IsRegisteredBreak(), CConcordance::m_bIndexChunks, CConcordance::m_bResumeOnIndexErrors, CPageNumber::m_PageNumber, CStringIndexator::m_pChunkIndex, CPageNumber::m_StartTokenNo, CHitBorders::StartTextAreaBorders(), and Trim().
Referenced by IndexFreeIndex().
|
private |
graphematical definition of a token for DWDSIndex
graphematical definition of a token
References Graphmat, CUnitHolder::HasDescr(), IsDigit(), IsSentenceEnd(), IsWord(), CConcordance::m_bIndexPunctuation, and OPun.
Referenced by IndexTextOrHtmlFile().
|
private |
References Format(), CGraphmatFile::GetLastError(), CBiblIndex::GetTextAreaElements(), CBiblIndex::GetTextAreasCount(), GetTextFromXMLRecursive(), Graphmat, CGraphmatFile::LoadStringToGraphan(), CConcXml::LoadXmlAndReadBibliography(), CConcordance::m_BiblIndex, CBibliography::m_StartPageInfo, CExpc::m_strCause, and UnknownPageNumber.
Referenced by LoadFileIntoGraphan().
|
private |
References CBibliography::CleanBibliography(), Format(), CGraphmatFile::GetLastError(), Graphmat, IsXmlFile(), CGraphmatFile::LoadStringToGraphan(), LoadXmlFile(), CConcordance::m_BiblIndex, CExpc::m_strCause, and CConcXml::SetFreeBiblAttribsEmpty().
Referenced by IndexTextOrHtmlFile().
void CConcIndexator::InitGraphan | ( | ) |
initializes graphematics using current options
References Format(), CGraphmatFile::GetLastError(), Graphmat, CGraphmatFile::LoadDicts(), CGraphmatFile::m_bConvertRussianJo2Je, CGraphmatFile::m_bEmptyLineIsSentenceDelim, CConcordance::m_bEmptyLineIsSentenceDelim, CGraphmatFile::m_bFilterUnprintableSymbols, CGraphmatFile::m_bUseIndention, CConcordance::m_bUseIndention, CGraphmatFile::m_bUseParagraphTagToDivide, CConcordance::m_bUseParagraphTagToDivide, CUnitHolder::m_Language, and CConcordance::m_Language.
Referenced by StartIndexing().
bool CConcIndexator::SaveCorpusFileList | ( | ) | const |
saves corpus file list (*._con)
References DDCVersionMinW(), DDCVersionStr, CConcordance::GetCommonFilePrefix(), CConcordance::GetFileNameForCorpusFileNames(), CConcordance::m_CorpusFiles, ddcCorpusList< OffT_ >::size(), and DDCVersionT::str().
Referenced by CreateAsUnion(), ConcIndexatorInvoker::FinalizeIndex(), and SplitProject().
bool CConcIndexator::SaveMaskedFileIds | ( | ) | const |
saves masked file-ids (*._masked_ids)
References CConcordance::GetFileNameForMaskedFileIds(), CConcordance::m_MaskedFiles, and WriteVector().
Referenced by CreateAsUnion(), and SplitProject().
void CConcIndexator::StartIndexing | ( | ) |
begins indexing
References InitGraphan(), LoadDwdsThesaurus(), CConcordance::m_BiblIndex, CStringIndexator::m_Path, m_pDwdsThesaurus, CConcXml::Start(), CHitBorders::StartIndexing(), CStringIndexator::StartIndexing(), and CConcordance::UseDwdsThesaurus().
Referenced by ConcIndexatorInvoker::BuildIndexStart().
void CConcIndexator::DestroyIndex | ( | ) |
destroy all index files
References CConcordance::AssertHasPath(), ddcCorpusList< OffT_ >::clear(), CStringIndexator::DestroyIndices(), FileExists(), CConcordance::GetFileNameForCorpusFileNames(), CConcordance::GetFileNameForMaskedFileIds(), CConcordance::GetFileNameForMaskedFiles(), CStringIndexator::GetSearchPeriodsFileName(), CConcordance::m_CorpusFiles, CConcordance::m_MaskedFiles, CStringIndexator::m_Path, and CHitBorders::RemoveHitBordersFileAndClear().
Referenced by ConcIndexatorInvoker::BuildIndex(), and ConcIndexatorInvoker::BuildIndexStart().
void CConcIndexator::NormalEndIndexing | ( | ) |
finishes indexing (normal way)
References CConcXml::FinalSaveBibliography(), FreeDwdsThesaurus(), CConcordance::m_BiblIndex, and m_pDwdsThesaurus.
Referenced by ConcIndexatorInvoker::FinalizeIndex().
void CConcIndexator::TerminateIndexing | ( | ) |
terminates indexing (for exceptions)
References CHitBorders::BordersEndIndexing(), CConcXml::ExitWithoutSave(), CConcordance::m_BiblIndex, CStringIndexator::m_Path, and CStringIndexator::TerminateIndexing().
Referenced by ConcIndexatorInvoker::BuildIndex().
void CConcIndexator::IndexOneFile | ( | CIndexDocument * | document | ) |
index one file according to m_IndexType
References CreateMorphIndex(), CConcordance::DWDS_Index, CConcordance::Free_Index, IndexFreeIndex(), IndexMorphXml(), CTabFormatIndexator::IndexTabFormat(), IndexTextOrHtmlFile(), CConcordance::m_IndexType, CConcordance::MorphXML_Index, and CConcordance::TabFormat_Index.
Referenced by ConcIndexatorInvoker::IndexFile().
void CConcIndexator::RollbackIndexOneFile | ( | CTokenNo | startTrimTokenNo | ) |
rollback data buffered by an immediate preceding failed IndexOneFile()
References ddcLogWarn, Format(), and CStringIndexator::m_Indices.
Referenced by ConcIndexatorInvoker::IndexFile().
void CConcIndexator::CalculateSearchPeriods | ( | DWORD | MaxTokenCountInOnePeriod | ) |
finds all subcorpora
References CHitBorders::GetCorpusEndTokenNo(), CHitBorders::GetFileBreaks(), CHitBorders::GetFileStartTokenNo(), CConcordance::m_CorpusFiles, CStringIndexator::m_SearchPeriods, and ddcCorpusList< OffT_ >::size().
Referenced by CreateAsUnion(), and ConcIndexatorInvoker::FinalizeIndex().
bool CConcIndexator::CreateAsUnion | ( | vector< CConcIndexator * > & | X, |
bool | inheritOptions = true |
||
) |
creates new concordance as union of one or more concordances (new)
References CSourceFileHolder::AddSourceFilesFrom(), CHitBorders::CBreakCollection::AppendBreaks(), ddcVecFile< T >::begin(), CalculateSearchPeriods(), ddcCorpusList< OffT_ >::clear(), ddcVecFile< T >::clear(), ddcLogInfo, ddcLogWarn, CSourceFileHolder::DeleteAllSourceFiles(), ddcVecFile< T >::end(), ddcVecFile< T >::ensureVec(), CStringIndexator::FinalSaveAllIndices(), Format(), CHitBorders::GetCorpusEndTokenNo(), CConcordance::GetFileNameForCorpusFileNames(), CConcordance::GetFileNameForMaskedFileIds(), CIndexItem::GetMaximalNumberOfRunningTokens(), GetMaxTokenCountInOnePeriod(), CConcordance::LoadOptionsFromString(), CConcordance::m_BiblIndex, CHitBorders::CBreakCollection::m_BreakOffsets, CHitBorders::m_Breaks, CConcordance::m_CorpusFiles, CHitBorders::m_FileBreakCollectionNo, CStringIndexator::m_Indices, CConcordance::m_MaskedFiles, CStringIndexSet::m_Name, CHitBorders::m_PageBreaks, CStringIndexator::m_Path, CStringIndexator::m_SearchPeriods, CStringIndexSet::m_ShortName, ddcCorpusList< OffT_ >::m_strings, ddcVecFile< T >::m_vec, MakeFName(), PredefinedFileBreakName, SaveCorpusFileList(), SaveMaskedFileIds(), CConcordance::SaveOptions(), CConcordance::SaveOptionsToString(), CHitBorders::SavePageBreaks(), CSourceFileHolder::SaveSourceFileList(), CBiblIndex::SetPath(), ddcVecFile< T >::size(), CConcXml::UnionBibliographies(), and CStringIndexSet::UnionIndexSets().
Referenced by CreateAsUnion().
bool CConcIndexator::CreateAsUnion | ( | const vector< string > & | Xfiles, |
bool | inheritOptions = true |
||
) |
creates new concordance as union of one or more concordances (new, filename-based)
References CConcIndexator(), CreateAsUnion(), and ddcLogInfo.
bool CConcIndexator::SplitProject | ( | vector< CConcIndexator *> & | Subs | ) | const |
split project uniformly into sub-projects (new; sub-projects inherit parent options)
References CConcordance::SaveOptionsToString().
bool CConcIndexator::SplitProject | ( | vector< CConcIndexator *> & | Subs, |
const string & | SubOptions | ||
) | const |
split project uniformly into sub-projects (guts, given sub-options as string)
References ddcVecFile< T >::begin(), ddcVecFile< T >::clear(), CStringIndexSet::CreateSplitPartitions(), ddcLogInfo, ddcLogWarn, ddcVecFile< T >::end(), ddcVecFile< T >::ensureData(), ddcVecFile< T >::ensureVec(), FileDirectory(), CStringIndexator::FinalSaveAllIndices(), Format(), CHitBorders::GetBreakCollectionByName(), CHitBorders::CBreakCollection::GetBreakFileName(), CHitBorders::GetCorpusEndTokenNo(), CConcordance::GetCorpusFilesCount(), CHitBorders::GetFileBreaks(), CConcordance::GetFileNameForCorpusFileNames(), GetMaxTokenCountInOnePeriod(), CConcordance::m_BiblIndex, CHitBorders::CBreakCollection::m_BreakOffsets, CHitBorders::m_Breaks, CConcordance::m_CorpusFiles, CStringIndexator::m_Indices, CConcordance::m_MaskedFiles, CStringIndexSet::m_Name, CHitBorders::m_PageBreaks, CStringIndexator::m_Path, CStringIndexSet::m_ShortName, CSourceFileHolder::m_SourceFiles, ddcVecFile< T >::m_vec, MakeDirP(), MakeFName(), SaveCorpusFileList(), SaveMaskedFileIds(), CConcordance::SaveOptionsToString(), CSourceFileHolder::SaveSourceFileList(), SaveStringToFile(), CHitBorders::CBreakCollection::SaveToFile(), ddcCorpusList< OffT_ >::size(), and CConcXml::SplitBibliography().
bool CConcIndexator::CreateMorphIndexWrapper | ( | ) |
creates morphology index
References CreateMorphIndex(), GetMaxTokenCountInOnePeriod(), CConcordance::m_Language, and CStringIndexator::m_Path.
Referenced by ConcIndexatorInvoker::BuildOnlyMorphIndex(), and ConcIndexatorInvoker::FinalizeIndex().
DWORD CConcIndexator::GetMaxTokenCountInOnePeriod | ( | ) | const |
returns the size of one subcorpus
References DefaultMaxTokenCountInOnePeriod, and CConcordance::m_UserMaxTokenCountInOnePeriod.
Referenced by ConcIndexatorInvoker::BuildIndex(), ConcIndexatorInvoker::BuildIndexStart(), CreateAsUnion(), CreateMorphIndexWrapper(), ConcIndexatorInvoker::IndexFiles(), ConcIndexatorInvoker::ProcessTarOrSingleFile(), and SplitProject().
DWORD CConcIndexator::GetMaxInputLoadIndexSize | ( | ) | const |
returns the max size of the input index in tokens; must be less than GetMaxTokenCountInOnePeriod()
References DefaultMaxInputLoadIndexSize, and CConcordance::m_UserMaxInputLoadIndexSize.
Referenced by ConcIndexatorInvoker::BuildIndexStart(), ConcIndexatorInvoker::IndexFiles(), and ConcIndexatorInvoker::ProcessTarOrSingleFile().
|
private |
graphmat (tokenization parser)
Referenced by IndexTextOrHtmlFile(), InitGraphan(), IsDWDSToken(), LoadFileIntoGraphan(), and LoadXmlFile().
|
private |
a reference to DWDS thesaurus if applicable
Referenced by CConcIndexator(), IndexTextOrHtmlFile(), NormalEndIndexing(), and StartIndexing().