00001 #ifndef _ConcBigrams_h
00002 #define _ConcBigrams_h
00003
00004 #include "IndexSetForQueryingStage.h"
00005
00006 struct CSecondWord
00007 {
00008
00009 DWORD m_TheOnlyOccur : 1;
00010 DWORD m_EndOfSentenceFlag : 1;
00011 DWORD m_Distance : 2;
00012 DWORD m_StartOccur: 28 ;
00013
00014 DWORD m_SecondWordId;
00015
00016 DWORD m_EndOccurOffset;
00017
00018 void InitWithZeros()
00019 {
00020 m_TheOnlyOccur = 0;
00021 m_EndOfSentenceFlag = 0;
00022 m_Distance = 0;
00023 m_StartOccur = 0;
00024 m_EndOccurOffset = 0;
00025 m_SecondWordId = 0;
00026
00027 }
00028 DWORD PackFlagsToDWORD () const
00029 {
00030 return (m_TheOnlyOccur << 31) | (m_EndOfSentenceFlag << 30) | (m_Distance << 28) | m_StartOccur;
00031 }
00032 void InitFlagsFromDWORD (const DWORD & v)
00033 {
00034 m_TheOnlyOccur = v >> 31 ;
00035 m_EndOfSentenceFlag = v >> 30;
00036 m_Distance = v >> 28 ;
00037 m_StartOccur = (v << 4) >> 4;
00038 }
00039
00040 };
00041
00042 class CIndexBigram
00043 {
00044 void CloseFiles();
00045
00046 public:
00047 string m_Path ;
00048 vector<DWORD> m_FirstWords;
00049 FILE* m_SecondWordFile;
00050
00051 CIndexBigram(string Path);
00052 ~CIndexBigram();
00053 string GetSecondWordFileName() const;
00054 string GetFirstWordFileName() const;
00055 bool LoadBigrams();
00056 bool RemoveFiles();
00057 size_t ReadSecondWords(size_t FirstWord, vector<CSecondWord>& SecondWords) const;
00058
00059
00060
00061
00062 };
00063
00064 class CIndexSetForBigrams : public CIndexSetForQueryingStage
00065 {
00067
00068
00069 void WriteBigramAndItsOccurs (CSecondWord& I, const DWORD BigramId, CItemIndexForLoading& M, FILE* res_fp, size_t& CurrPositionInResFile, FILE* second_word_fp);
00070 bool CreateRightBigrams(string TempRightBigramsFileName);
00071 string GetRightToLeftPerdiv() const;
00072 map<DWORD, DWORD> m_RightToLeftPerdivIds;
00073 public:
00074 CIndexBigram m_LeftBigrams;
00075 CIndexBigram m_RightBigrams;
00076
00077
00078
00079
00081 CIndexSetForBigrams(const CStringIndexator* pParent);
00082 virtual ~CIndexSetForBigrams();
00083 string GetName() const;
00085 bool DestroyIndexSet ();
00086
00087 bool LoadIndexSet();
00088
00090 bool ConvertTempBigramsToPersistent(string TempBigramsFileName, const vector<CIndexItem>& MainTokenIndex);
00091
00093 void FindOccurrencesForBigrams (const BigramDirectionEnum bigr_direc, const vector<DWORD>& IndexItems, const size_t PeriodNo, vector<pair<CTokenNo,DWORD> >& occurrences, CShortOccurCacheMap* pCaches, vector<int>& CacheIds) const;
00095 void ReadAllOccurrences (size_t IndexItemNo, const CSecondWord& I, vector<CTokenNo>& Occurs) const;
00096
00097
00098 };
00099
00100 #endif