ddc
|
#include <HitBorder.h>
Classes | |
struct | CBreakCollection |
Public Member Functions | |
const CBreakCollection * | GetBreakCollectionByName (const string &Name) const |
moo: get break collection by long or short name More... | |
const vector< CBreakCollection > & | GetBreaks (void) const |
moo: get break collection map (dangerous) More... | |
CHitBorders () | |
string | GetBorderIndicesString () const |
return the string representation of break collection descriptions More... | |
string | WithinBreakName (const vector< string > &Within) const |
const ddcBreakVector * | GetBreaksByName (const string &ShortName) const |
returns a break collection by a short name More... | |
CTokenNo | GetCorpusEndTokenNo () const |
returns the value of the last file break (which should be equal to the last value of any break collection) More... | |
const ddcBreakVector & | GetFileBreaks () const |
quick reference to file breaks More... | |
CTokenNo | GetFileStartTokenNo (size_t FileNo) const |
returns the start position of corpus file FileNo More... | |
DWORD | GetPageNumber (size_t No) const |
returns m_PageBreaks[No].m_PageNumber (see CPageNumber) More... | |
bool | IsRegisteredBreak (const string &ShortName) const |
returns true if a short name is found in m_Breaks More... | |
void | RegisterBorderIndices (const char *IndicesStr) |
creates empty elements of m_Breaks from their string descriptions More... | |
bool | LoadHitBorders (string Path, bool useMMap=false) |
load break collections from the disk More... | |
void | ConvertHitsToPageBreaks (vector< CHit >::const_iterator hits_begin, vector< CHit >::const_iterator hits_end, const ddcBreakVector &Breaks, DwordVector &PageBreaks) const |
converts hits to page breaks, which contain these breaks More... | |
ddcVecFile< CPageNumber >::const_iterator | GetTokenPageBreak (CTokenNo tok) const |
get page break for a given token number as an iterator into m_PageBreaks More... | |
void | AddBreakByName (const string &ShortName, const CTokenNo &B) |
adds one break to a collection identified by a short name (during indexing) More... | |
void | BordersEndIndexing (string Path) |
closes all CBreakCollectionDescr::m_FileForIndexing from m_Breaks (during indexing) More... | |
void | StartTextAreaBorders () |
must be called before indexing each text area in order to create at least one break in each text area More... | |
void | EndTextAreaBorders (DWORD TextAreaEndTokenNo) |
must be called after indexing each text area in order to create at least one break in each text area More... | |
Protected Member Functions | |
string | GetPageBreaksFileName (string Path) const |
returns the file name for page breaks More... | |
string | GetShortNameByName (const string &BreakName) const |
returns the short name of a break collection by the long or the short name More... | |
bool | StartIndexing (string Path) |
opens for writing all CBreakCollectionDescr::m_FileForIndexing from m_Breaks More... | |
bool | RemoveHitBordersFileAndClear (string Path) |
deletes all break files More... | |
void | AddPageBreak (const CPageNumber &P) |
adds one page break More... | |
void | SavePageBreaks (const string &ProjectPath) |
save page break file More... | |
int | RegisterBreak (string ShortName, string LongName) |
int | EnsureRegisteredBreak (string ShortName, string LongName) |
int | GetBreakCollectionIndexByName (string ShortName) const |
void | AddBreakByIndex (DWORD BreakCollectionNo, const CTokenNo &B) |
Protected Attributes | |
vector< CBreakCollection > | m_Breaks |
all breaks More... | |
map< string, int > | m_ShortName2BreakCollection |
the map from CBreakCollection.m_ShortName to the index in m_Breaks More... | |
map< string, int > | m_LongName2BreakCollection |
the map from CBreakCollection.m_LongName to the index in m_Breaks More... | |
int | m_FileBreakCollectionNo |
a quick reference to file breaks (which are also stored in m_Breaks) More... | |
string | m_DefaultBreakName |
The name of the default break collection (written in the options file) More... | |
ddcVecFile< CPageNumber > | m_PageBreaks |
page number collection More... | |
vector< DWORD > | m_LastTextAreaBreaks |
Class CHitBorders contains all break collections and all page breaks.
CHitBorders::CHitBorders | ( | ) |
References m_FileBreakCollectionNo.
|
protected |
returns the file name for page breaks
References MakeFName().
Referenced by LoadHitBorders(), RemoveHitBordersFileAndClear(), and SavePageBreaks().
|
protected |
returns the short name of a break collection by the long or the short name
References m_Breaks, and CHitBorders::CBreakCollection::m_ShortName.
Referenced by WithinBreakName().
|
protected |
opens for writing all CBreakCollectionDescr::m_FileForIndexing from m_Breaks
References ddcVecFile< T >::clear(), ErrorMessage(), Format(), CHitBorders::CBreakCollection::GetBreakFileName(), m_Breaks, CHitBorders::CBreakCollection::m_FileForIndexing, and m_PageBreaks.
Referenced by CConcIndexator::StartIndexing().
|
protected |
deletes all break files
References ddcVecFile< T >::clear(), CHitBorders::CBreakCollection::ClearAll(), FileExists(), GetPageBreaksFileName(), m_Breaks, and m_PageBreaks.
Referenced by CConcIndexator::DestroyIndex().
|
protected |
adds one page break
References ddcVecFile< T >::ensureVec(), m_PageBreaks, CPageNumber::m_PageNumber, CPageNumber::m_StartTokenNo, ddcVecFile< T >::m_vec, and UnknownPageNumber.
Referenced by CConcIndexator::IndexFreeIndex(), CConcIndexator::IndexMorphXml(), CConcIndexator::IndexOneTableTextArea(), and CConcIndexator::IndexTextOrHtmlFile().
|
protected |
save page break file
References GetPageBreaksFileName(), m_PageBreaks, and ddcVecFile< T >::save().
Referenced by BordersEndIndexing(), and CConcIndexator::CreateAsUnion().
|
protected |
References m_Breaks, m_LastTextAreaBreaks, CHitBorders::CBreakCollection::m_LongName, m_LongName2BreakCollection, CHitBorders::CBreakCollection::m_ShortName, and m_ShortName2BreakCollection.
Referenced by EnsureRegisteredBreak(), and RegisterBorderIndices().
|
protected |
References GetBreakCollectionIndexByName(), and RegisterBreak().
Referenced by RegisterBorderIndices().
|
protected |
References m_LongName2BreakCollection, and m_ShortName2BreakCollection.
Referenced by EnsureRegisteredBreak(), GetBreakCollectionByName(), CConcIndexator::IndexOneTableTextArea(), and CConcIndexator::IndexTextOrHtmlFile().
References Format(), m_Breaks, CHitBorders::CBreakCollection::m_FileForIndexing, m_LastTextAreaBreaks, CHitBorders::CBreakCollection::m_LongName, and save_to_bytes().
Referenced by AddBreakByName(), EndTextAreaBorders(), CConcIndexator::IndexOneTableTextArea(), and CConcIndexator::IndexTextOrHtmlFile().
const CHitBorders::CBreakCollection * CHitBorders::GetBreakCollectionByName | ( | const string & | Name | ) | const |
moo: get break collection by long or short name
References GetBreakCollectionIndexByName(), and m_Breaks.
Referenced by CQToken::BreakName(), CQCount::CountUniversal(), CConcordance::LoadOptionsFromString(), and CConcIndexator::SplitProject().
|
inline |
moo: get break collection map (dangerous)
Referenced by CDDCLeafServer::handle__info().
string CHitBorders::GetBorderIndicesString | ( | ) | const |
return the string representation of break collection descriptions
References Format(), m_Breaks, m_DefaultBreakName, CHitBorders::CBreakCollection::m_LongName, CHitBorders::CBreakCollection::m_ShortName, m_ShortName2BreakCollection, PredefinedFileBreakName, and Trim().
Referenced by CConcordance::LoadOptionsFromString(), and CConcordance::SaveOptionsToString().
string CHitBorders::WithinBreakName | ( | const vector< string > & | Within | ) | const |
returns the short name of the last valid break collection (long or short) in Within
References GetShortNameByName(), and m_DefaultBreakName.
Referenced by CQueryOptions::Compile().
const ddcBreakVector * CHitBorders::GetBreaksByName | ( | const string & | ShortName | ) | const |
returns a break collection by a short name
References m_Breaks, and m_ShortName2BreakCollection.
CTokenNo CHitBorders::GetCorpusEndTokenNo | ( | ) | const |
returns the value of the last file break (which should be equal to the last value of any break collection)
References ddcVecFile< T >::empty(), GetFileBreaks(), m_FileBreakCollectionNo, and ddcVecFile< T >::size().
Referenced by CConcIndexator::CalculateSearchPeriods(), CConcIndexator::CreateAsUnion(), ConcIndexatorInvoker::FinalizeIndex(), CDDCLeafServer::handle__info(), and CConcIndexator::SplitProject().
const ddcBreakVector & CHitBorders::GetFileBreaks | ( | ) | const |
quick reference to file breaks
References m_Breaks, and m_FileBreakCollectionNo.
Referenced by CConcIndexator::CalculateSearchPeriods(), CQCount::CountUniversal(), GetCorpusEndTokenNo(), GetFileStartTokenNo(), and CConcIndexator::SplitProject().
CTokenNo CHitBorders::GetFileStartTokenNo | ( | size_t | FileNo | ) | const |
returns the start position of corpus file FileNo
References ddcVecFile< T >::empty(), GetFileBreaks(), and m_FileBreakCollectionNo.
Referenced by CConcIndexator::CalculateSearchPeriods().
DWORD CHitBorders::GetPageNumber | ( | size_t | No | ) | const |
returns m_PageBreaks[No].m_PageNumber (see CPageNumber)
References ddcVecFile< T >::empty(), m_PageBreaks, ddcVecFile< T >::size(), and UnknownPageNumber.
bool CHitBorders::IsRegisteredBreak | ( | const string & | ShortName | ) | const |
returns true if a short name is found in m_Breaks
References m_ShortName2BreakCollection.
Referenced by CConcIndexator::IndexOneTableTextArea().
void CHitBorders::RegisterBorderIndices | ( | const char * | IndicesStr | ) |
creates empty elements of m_Breaks from their string descriptions
References EnsureRegisteredBreak(), Format(), m_Breaks, m_DefaultBreakName, m_FileBreakCollectionNo, m_LastTextAreaBreaks, m_LongName2BreakCollection, CHitBorders::CBreakCollection::m_ShortName, m_ShortName2BreakCollection, StringTokenizer::next_token(), PredefinedFileBreakName, PredefinedTableLineTag, PredefinedTextAreaBreakName, RegisterBreak(), Trim(), and StringTokenizer::val().
Referenced by CConcordance::LoadOptionsFromString().
bool CHitBorders::LoadHitBorders | ( | string | Path, |
bool | useMMap = false |
||
) |
load break collections from the disk
References GetPageBreaksFileName(), m_Breaks, m_PageBreaks, ddcVecFile< T >::open(), and CHitBorders::CBreakCollection::ReadFromDisk().
Referenced by ConcIndexatorInvoker::FinalizeIndex().
void CHitBorders::ConvertHitsToPageBreaks | ( | vector< CHit >::const_iterator | hits_begin, |
vector< CHit >::const_iterator | hits_end, | ||
const ddcBreakVector & | Breaks, | ||
DwordVector & | PageBreaks | ||
) | const |
converts hits to page breaks, which contain these breaks
References ddcVecFile< T >::begin(), ddcVecFile< T >::end(), and m_PageBreaks.
ddcVecFile< CPageNumber >::const_iterator CHitBorders::GetTokenPageBreak | ( | CTokenNo | tok | ) | const |
get page break for a given token number as an iterator into m_PageBreaks
References ddcVecFile< T >::begin(), ddcVecFile< T >::end(), and m_PageBreaks.
void CHitBorders::AddBreakByName | ( | const string & | ShortName, |
const CTokenNo & | B | ||
) |
adds one break to a collection identified by a short name (during indexing)
References AddBreakByIndex(), and m_ShortName2BreakCollection.
Referenced by ConcIndexatorInvoker::IndexFile(), CConcIndexator::IndexMorphXml(), and CConcIndexator::IndexTextOrHtmlFile().
void CHitBorders::BordersEndIndexing | ( | string | Path | ) |
closes all CBreakCollectionDescr::m_FileForIndexing from m_Breaks (during indexing)
References CHitBorders::CBreakCollection::CloseFileForIndexing(), m_Breaks, and SavePageBreaks().
Referenced by ConcIndexatorInvoker::FinalizeIndex(), and CConcIndexator::TerminateIndexing().
void CHitBorders::StartTextAreaBorders | ( | ) |
must be called before indexing each text area in order to create at least one break in each text area
References m_Breaks, and m_LastTextAreaBreaks.
Referenced by CConcIndexator::IndexOneTableTextArea().
void CHitBorders::EndTextAreaBorders | ( | DWORD | TextAreaEndTokenNo | ) |
must be called after indexing each text area in order to create at least one break in each text area
References AddBreakByIndex(), m_FileBreakCollectionNo, and m_LastTextAreaBreaks.
Referenced by CConcIndexator::IndexOneTableTextArea().
|
protected |
all breaks
Referenced by AddBreakByIndex(), BordersEndIndexing(), CConcIndexator::CreateAsUnion(), GetBorderIndicesString(), GetBreakCollectionByName(), GetBreaksByName(), GetFileBreaks(), GetShortNameByName(), LoadHitBorders(), RegisterBorderIndices(), RegisterBreak(), RemoveHitBordersFileAndClear(), CConcIndexator::SplitProject(), StartIndexing(), and StartTextAreaBorders().
|
protected |
the map from CBreakCollection.m_ShortName to the index in m_Breaks
Referenced by AddBreakByName(), GetBorderIndicesString(), GetBreakCollectionIndexByName(), GetBreaksByName(), IsRegisteredBreak(), RegisterBorderIndices(), and RegisterBreak().
|
protected |
the map from CBreakCollection.m_LongName to the index in m_Breaks
Referenced by GetBreakCollectionIndexByName(), RegisterBorderIndices(), and RegisterBreak().
|
protected |
a quick reference to file breaks (which are also stored in m_Breaks)
Referenced by CHitBorders(), CConcIndexator::CreateAsUnion(), EndTextAreaBorders(), GetCorpusEndTokenNo(), GetFileBreaks(), GetFileStartTokenNo(), and RegisterBorderIndices().
|
protected |
The name of the default break collection (written in the options file)
Referenced by GetBorderIndicesString(), RegisterBorderIndices(), and WithinBreakName().
|
protected |
page number collection
Referenced by AddPageBreak(), ConvertHitsToPageBreaks(), CConcIndexator::CreateAsUnion(), GetPageNumber(), GetTokenPageBreak(), LoadHitBorders(), RemoveHitBordersFileAndClear(), SavePageBreaks(), CConcIndexator::SplitProject(), and StartIndexing().
|
protected |
Referenced by AddBreakByIndex(), EndTextAreaBorders(), RegisterBorderIndices(), RegisterBreak(), and StartTextAreaBorders().