// Declarations of COutputToken and CStringIndexSet.
00002 #ifndef IndexSet_h
00003 #define IndexSet_h
00004
00005 #include "../ConcordLib/IndexSetForLoadingStage.h"
00006 #include "../ConcordLib/IndexSetForQueryingStage.h"
00007 #include "../ConcordLib/ConcBigrams.h"
00008 #include "../PCRE/pcre_rml.h"
00009
/// Token record; CStringIndexSet::GetTokensFromStorage fills a vector of these.
struct COutputToken
{
    /// Token text. This is the only field compared by operator==.
    string m_TokenStr;
    /// Interpretation string; both constructors leave it empty.
    string m_InterpStr;
    /// Highlight flag; false unless set through the two-argument constructor.
    bool m_bHighlight;

    /// Creates a token with empty strings and the highlight flag cleared.
    COutputToken()
        : m_bHighlight(false)
    {
    }

    /// Creates a token from its text and highlight flag.
    /// @param WordStr     text copied into m_TokenStr
    /// @param bHighlight  initial value of m_bHighlight
    COutputToken(const string& WordStr, bool bHighlight)
        : m_TokenStr(WordStr), m_bHighlight(bHighlight)
    {
    }

    /// Two tokens are equal when their texts match; m_InterpStr and
    /// m_bHighlight do not take part in the comparison.
    bool operator==(const COutputToken& X) const
    {
        return m_TokenStr == X.m_TokenStr;
    }
};
00036
00037
00038
00039 class CStringIndexator;
00040
00041
00047 class CStringIndexSet : public CIndexSetForLoadingStage, public CIndexSetForQueryingStage
00048 {
00050 FILE* m_StorageFile;
00051
00052
00053
00054
00056 bool ConvertLoadIndexToWorkingIndex ();
00058 string GetStorageFileName() const;
00060 string GetLeftBigramsFileName() const;
00062 bool CreateUnionTokenStorage(const CStringIndexSet& I1, const CStringIndexSet& I2, const map<DWORD, DWORD>& First2Result, const map<DWORD, DWORD>& Second2Result);
00064 bool SaveOnePartOfUnionTokenStorage(FILE * res_fp, const map<DWORD, DWORD>& Old2New) const;
00066 bool OpenStorageFile();
00068 void CloseStorageFile();
00070 string GetName() const;
00072 bool ConvertTempStorageToPersistent();
00073 bool DumpBigramsOfOneDirection ( BigramDirectionEnum bigram_direc ) const;
00074
00075 public:
00076
00078 string m_Name;
00079
00081 string m_ShortName;
00082
00083 CIndexSetForBigrams m_BigramsIndex;
00084
00086 template<class T>
00087 const char* GetIndexItemStr(const T& W) const
00088 {
00089 return &m_StringBuffer[0] + W.GetIndexItemOffset();
00090 };
00091
00092 CStringIndexSet(const CStringIndexator* pParent);
00093 ~CStringIndexSet();
00094
00096 void InitIndexSet(string Name, string ShortName, bool bCreateItemStorage, bool bCompress);
00098 bool ReadFromTheDisk();
00100 bool DestroyIndexSet();
00102 bool WriteToFile(bool bAfterLoading);
00104 void UnionIndexSet(const CStringIndexSet& I1, const CStringIndexSet& I2, const CTokenNo EndToken1, const CTokenNo EndToken2);
00106 bool GetTokensFromStorage(const size_t start_offset, const size_t end_offset, vector<COutputToken>& Tokens) const;
00108 void FindOccurrences (const vector<DWORD>& IndexItems, const size_t PeriodNo, vector<CTokenNo>& occurrences, CMyTimeSpanHolder& Profilerp, CShortOccurCacheMap* pCaches, vector<int>& CacheIds) const;
00110 void FindChunkOccurrences (const vector<DWORD>& IndexItems, vector<CTokenNo>& occurrences, vector<size_t>& ChunkLengths, size_t PeriodNo, CMyTimeSpanHolder& Profilerp, CShortOccurCacheMap* pCaches, vector<int>& CacheIds) const;
00112 void QueryTokenList (const string& WordForm, vector<DWORD>& MatchWords) const;
00114 void QueryTokenListWithRightTruncation (const string& WordForm, vector<DWORD>& MatchWords) const;
00116 void QueryTokenListUsingRegExp (RML_RE &RegExp, vector<DWORD>& MatchWords) const;
00118 bool DumpStorage() const;
00120 bool DumpBigrams() const;
00121 };
00122
00123
00124
00125
00126 #endif