ddc
|
#include <ConcXml.h>
Public Member Functions | |
CConcXml () | |
~CConcXml () | |
void | DeleteFiles () |
bool | Start (string ProjectFileName) |
initialize building bibliography for corpus ProjectFileName More... | |
bool | AddIndexItem (const CBibliography &Bibliography) |
add one record Bibliography More... | |
void | FinalSaveBibliography () |
save all indices and stop indexing bibliography More... | |
void | ExitWithoutSave () |
stop indexing bibliography and remove index files More... | |
bool | UnionBibliographies (const vector< const CConcXml *> &Bibls) |
union of multiple (heterogeneous) bibliographies (new) More... | |
bool | SplitBibliography (vector< CConcXml *> &Bibls, const vector< DWORD > &EndFileNo) const |
partition this object into multiple sub-objects (called by CConcIndexator::SplitProject()) More... | |
void | LoadXmlAndReadBibliography (TiXmlDocument &doc, const char *pFileBuffer, CBibliography &Bibl) |
load xml file into TiXmlDocument& doc and load bibliographical fields into CBibliography& Bibl More... | |
void | ReadMorphXmlFileIntoGraTable (string FileName, const char *pFileBuffer, vector< CXmlToken > &GraTable, CBibliography &Bibl) |
load xml file under MorphXML_Index into vector<CXmlToken>& GraTable More... | |
void | SetFreeBiblAttribsEmpty (CBibliography &Bibl) const |
set all free bibliographical attributes to "" More... | |
void | SetFreeBiblByName (const string &name, const string &value, CBibliography &result) const |
Public Member Functions inherited from CBiblIndex | |
CBiblIndex () | |
void | FreeBiblIndices () |
clears m_FreeBiblIndices More... | |
void | FreeBiblExpanders () |
clears m_BiblExpanders More... | |
bool | RegisterFreeBiblAttributes (string fields, string &ErrorStr) |
initializes free bibliographical attribute descriptions More... | |
bool | RegisterTextAreas (string fields, string &ErrorStr) |
initializes free text areas descriptions More... | |
string | GetFreeBibiAttributesDescr () const |
return free bibliographical attribute description More... | |
string | GetTextAreasDescr () const |
return full text area description More... | |
void | SetPath (string ProjectFileName) |
void | LoadBibl (string Path, size_t FileBreaksSize, bool useMMap=false) |
CBibliography | GetFullBibliographyOfHit (size_t FileNo) const |
returns the bibliographical record More... | |
string | GetBiblIndexFileName () const |
string | GetBiblFileName () const |
string | GetBiblDateIndexFileName () const |
int | GetTextAreaByName (const string &Name) const |
CFreeBiblIndex * | GetFreeBiblIndex (const string &FreeBiblAttribNameOrAlias) const |
moo: not quite as ugly or dangerous a hack (respects aliases) More... | |
CBiblExpander * | GetBiblExpander (const string &ExpanderName) const |
moo: not quite as ugly or dangerous a hack More... | |
bool | GetFilterBounds (CDDCFilterWithBounds &Filter, const string &LoValue, const string &HiValue) const |
bool | GetFilterValue (CDDCFilterWithBounds &Filter, const string &Value) const |
bool | GetFilterValues (CDDCFilterWithBounds &Filter, const string &Regex) const |
bool | GetFilterValues (CDDCFilterWithBounds &Filter, const set< string > &Values) const |
void | GetTextAreaElements (const TiXmlDocument &doc, vector< TiXmlElement *> &Result) const |
return all text area elements for this document ("doc") More... | |
size_t | GetTextAreasCount () const |
return the number of text areas More... | |
int | WithinTextArea (const vector< string > &Within) const |
void | InitNoSort (vector< CHit > &Hits) const |
void | InitSortByDate (vector< CHit > &Hits) const |
void | InitSortByBiblIntegerField (string FreeBiblAttribNameOrAlias, vector< CHit > &Hits) const |
bool | IsRegisteredBiblField (const string &FreeBiblAttribNameOrAlias) const |
CFreeBiblIndexTypeId | GetBiblFieldTypeId (const string &FreeBiblAttribName) const |
string | FreeBiblMapToJson (bool useUtf) const |
print to json More... | |
string | FreeBiblAliasMapToJson () const |
string | BiblExpanderMapToJson () const |
const ddcDateVector & | GetDates () const |
moo: ugly dangerous hack More... | |
bool | HasFreeBiblIndex (const string &name) const |
not quite as ugly or dangerous a hack (respects aliases) More... | |
bool | HasBiblExpander (const string &ExpanderName) const |
moo: not quite as ugly or dangerous a hack More... | |
const CFreeBiblIndexInterface * | GetFreeBiblIndexConst (const string &name) const |
moo: not quite as ugly or dangerous a hack (respects aliases) More... | |
void | SetRegexOptions (const RML_RE::Options &opts) |
set regex options for all registered fields More... | |
string | GetVisibleFreeHeaderBiblAttributes (size_t FileNo, string Delim) const |
return values of all visible free bibliographical attributes for the given FileNo delimited by "Delim" More... | |
string | GetFreeHeaderBiblAttributesWithNames (size_t FileNo, char Delim) const |
return names and values of all free bibliographical attributes for the given FileNo delimited by "Delim" More... | |
string | GetFreeHeaderBiblAttributesJson (size_t FileNo, bool assume_utf8=true, bool include_invisible=false) const |
return names and values of all free bibliographical attributes for the given FileNo as JSON (without enclosing {}) More... | |
string | GetFreeHeaderBiblAttributesTabsDump (size_t FileNo, bool assume_utf8=true, bool include_invisible=false) const |
return names and values of all free bibliographical attributes for the given FileNo as tt-comments (for tt-mode dump) More... | |
bool | FileMatches (DWORD FileNo, const vector< CDDCFilterWithBounds > &Filters) const |
test whether all compiled Filters match FileNo More... | |
CBiblExpander * | AddBiblExpander (const string &spec) |
add a new bibliographic expander to m_BiblExpanders, or replace an existing one; returns new expander More... | |
Private Attributes | |
FILE * | m_BodyFileToBuild |
size_t | m_BodyFileToBuildSize |
Additional Inherited Members | |
Public Attributes inherited from CBiblIndex | |
string | m_DefaultAttrName |
name of default bibliographic field to query if no literal match is found. This can be used in conjunction with a constant bibliographic metadata attribute (CConcXml::CFreeBiblStringConstant) to provide a default value for unknown bibliographic metadata attributes, e.g. to facilitate interoperability between multiple corpora. If set to the empty string (the default), query filters on an undefined bibliographic attribute will raise an error. More... | |
Protected Types inherited from CBiblIndex | |
typedef map< string, CFreeBiblIndex * > | FreeBiblStringMap |
typedef map< string, string > | FreeBiblAliasMap |
typedef map< string, CBiblExpander * > | BiblExpanderMap |
Protected Attributes inherited from CBiblIndex | |
FreeBiblStringMap | m_FreeBiblIndices |
FreeBiblAliasMap | m_FreeBiblAlias |
BiblExpanderMap | m_BiblExpanders |
map< string, size_t > | m_FreeBiblNameToPosition |
ddcVecFile< file_off_t > | m_EndOffsetsInBiblFile |
ddcDateVector | m_Dates |
string | m_Path |
string | m_OrigXPath |
string | m_ScanXPath |
string | m_DateXPath |
string | m_StartPageXPath |
CConcXml is used for building bibliographical indices and working with xml.
CConcXml::CConcXml | ( | ) |
References m_BodyFileToBuild.
CConcXml::~CConcXml | ( | ) |
References CBiblIndex::FreeBiblExpanders(), CBiblIndex::FreeBiblIndices(), and m_BodyFileToBuild.
void CConcXml::DeleteFiles | ( | ) |
References CBiblIndex::GetBiblDateIndexFileName(), CBiblIndex::GetBiblFileName(), CBiblIndex::GetBiblIndexFileName(), CBiblIndex::m_FreeBiblIndices, and CBiblIndex::m_Path.
Referenced by ExitWithoutSave().
bool CConcXml::Start | ( | string | ProjectFileName | ) |
initialize building bibliography for corpus ProjectFileName
References ddcVecFile< T >::clear(), CBiblIndex::GetBiblFileName(), m_BodyFileToBuild, m_BodyFileToBuildSize, CBiblIndex::m_Dates, CBiblIndex::m_EndOffsetsInBiblFile, CBiblIndex::m_FreeBiblIndices, and CBiblIndex::SetPath().
Referenced by CConcIndexator::StartIndexing().
bool CConcXml::AddIndexItem | ( | const CBibliography & | Bibliography | ) |
add one record Bibliography
References CBibliography::ConvertDateToInt(), CBibliography::m_BiblAttribs, m_BodyFileToBuild, m_BodyFileToBuildSize, CBiblIndex::m_Dates, CBiblIndex::m_EndOffsetsInBiblFile, CBiblIndex::m_FreeBiblIndices, ddcVecFile< T >::m_vec, and CBibliography::WriteToString().
Referenced by CConcIndexator::IndexFreeIndex(), CConcIndexator::IndexMorphXml(), and CConcIndexator::IndexTextOrHtmlFile().
void CConcXml::FinalSaveBibliography | ( | ) |
save all indices and stop indexing bibliography
References ddcVecFile< T >::ensureVec(), FileExists(), Format(), CBiblIndex::GetBiblDateIndexFileName(), CBiblIndex::GetBiblIndexFileName(), m_BodyFileToBuild, m_BodyFileToBuildSize, CBiblIndex::m_Dates, CBiblIndex::m_EndOffsetsInBiblFile, CBiblIndex::m_FreeBiblIndices, CBiblIndex::m_Path, ddcVecFile< T >::m_vec, and WriteVector().
Referenced by CConcIndexator::NormalEndIndexing().
void CConcXml::ExitWithoutSave | ( | ) |
stop indexing bibliography and remove index files
References DeleteFiles(), m_BodyFileToBuild, and m_BodyFileToBuildSize.
Referenced by CConcIndexator::TerminateIndexing().
bool CConcXml::UnionBibliographies | ( | const vector< const CConcXml *> & | Bibls | ) |
union of multiple (heterogeneous) bibliographies (new)
References ddcVecFile< T >::clear(), CFreeBiblIndex::CreateUnion(), ddcLogInfo, ddcLogWarn, ddcVecFile< T >::ensureData(), ddcVecFile< T >::ensureVec(), FileAppend(), FileExists(), FileSize(), Format(), CBiblIndex::GetBiblDateIndexFileName(), CBiblIndex::GetBiblFileName(), CBiblIndex::GetBiblIndexFileName(), CBiblIndex::m_Dates, CBiblIndex::m_EndOffsetsInBiblFile, CBiblIndex::m_FreeBiblIndices, CBiblIndex::m_Path, ddcVecFile< T >::m_vec, ddcVecFile< T >::save(), and ddcVecFile< T >::size().
Referenced by CConcIndexator::CreateAsUnion().
bool CConcXml::SplitBibliography | ( | vector< CConcXml *> & | Bibls, |
const vector< DWORD > & | EndFileNo | ||
) | const |
partition this object into multiple sub-objects (called by CConcIndexator::SplitProject())
References ddcVecFile< T >::begin(), CFreeBiblIndex::CreatePartitions(), ddcLogInfo, ddcLogTrace, ddcLogWarn, FileAppendPartial(), FileSize(), Format(), FSeek(), CBiblIndex::GetBiblDateIndexFileName(), CBiblIndex::GetBiblFileName(), CBiblIndex::GetBiblIndexFileName(), CBiblIndex::GetFreeBiblIndex(), CBiblIndex::m_Dates, CBiblIndex::m_EndOffsetsInBiblFile, CBiblIndex::m_FreeBiblIndices, and ddcVecFile< T >::m_vec.
Referenced by CConcIndexator::SplitProject().
void CConcXml::LoadXmlAndReadBibliography | ( | TiXmlDocument & | doc, |
const char * | pFileBuffer, | ||
CBibliography & | Bibl | ||
) |
load xml file into TiXmlDocument& doc and load bibliographical fields into CBibliography& Bibl
References CBibliography::CleanBibliography(), CBibliography::ConvertDateToInt(), TiXmlDocument::Error(), TiXmlDocument::ErrorCol(), TiXmlDocument::ErrorDesc(), TiXmlDocument::ErrorRow(), Format(), CBibliography::m_BiblAttribs, CBibliography::m_DateStr, CBiblIndex::m_DateXPath, CBiblIndex::m_FreeBiblIndices, CBibliography::m_OrigBibl, CBiblIndex::m_OrigXPath, CBibliography::m_ScanBibl, CBiblIndex::m_ScanXPath, CBibliography::m_StartPageInfo, CBiblIndex::m_StartPageXPath, CFreeBiblIndex::m_Xpath, TiXmlDocument::Parse(), ReadXmlField(), CBibliography::Sanitize(), and UnknownPageNumber.
Referenced by CConcIndexator::IndexFreeIndex(), CConcIndexator::LoadXmlFile(), and ReadMorphXmlFileIntoGraTable().
void CConcXml::ReadMorphXmlFileIntoGraTable | ( | string | FileName, |
const char * | pFileBuffer, | ||
vector< CXmlToken > & | GraTable, | ||
CBibliography & | Bibl | ||
) |
load xml file under MorphXML_Index into vector<CXmlToken>& GraTable
References TiXmlNode::FirstChild(), TiXmlNode::FirstChildElement(), Format(), CBiblIndex::GetTextAreaElements(), LoadXmlAndReadBibliography(), CXmlToken::m_Annots, CXmlMorphAnnot::m_GrammemsStr, CXmlMorphAnnot::m_Lemma, CXmlToken::m_Type, CXmlToken::m_WordStr, TiXmlNode::NextSiblingElement(), Trim(), and TiXmlNode::Value().
Referenced by CConcIndexator::IndexMorphXml().
void CConcXml::SetFreeBiblAttribsEmpty | ( | CBibliography & | Bibl | ) | const |
set all free bibliographical attributes to ""
References CBibliography::m_BiblAttribs, and CBiblIndex::m_FreeBiblIndices.
Referenced by CConcIndexator::LoadFileIntoGraphan().
void CConcXml::SetFreeBiblByName | ( | const string & | name, |
const string & | value, | ||
CBibliography & | result | ||
) | const |
References Format(), CBibliography::m_BiblAttribs, and CBiblIndex::m_FreeBiblNameToPosition.
FILE* CConcXml::m_BodyFileToBuild |
private |
Referenced by AddIndexItem(), CConcXml(), ExitWithoutSave(), FinalSaveBibliography(), Start(), and ~CConcXml().
size_t CConcXml::m_BodyFileToBuildSize |
private |
Referenced by AddIndexItem(), ExitWithoutSave(), FinalSaveBibliography(), and Start().