ddc: ConcCommon.h Source File

Go to the documentation of this file.
00001 #ifndef __ConcCommon_H_
00002 #define __ConcCommon_H_
00003 
00011 #include "../common/utilit.h"
00012 #include "list"
00013 #include "limits.h"
00014 
00015 
00016 #include "../GraphanLib/GraphmatFile.h"
00017 #include "../LemmatizerLib/Lemmatizers.h"
00018 
00019 #include "../AgramtabLib/EngGramTab.h"
00020 #include "../AgramtabLib/RusGramTab.h"
00021 #include "../AgramtabLib/GerGramTab.h"
00022 #include "../common/DDC_common.h"
00023 #include "../tinyxml/tinyxml.h"
00024 
00025 
00026 
00027 
00029 const char globalFieldDelimeter = '\t';
00031 const string PredefinedTableLineTag = "l";
00033 const string ChunkIndexName = "chunk";
00035 const string LeftBigramsIndexName = "left";
00037 const string RightBigramsIndexName = "right";
00039 const string PredefinedFileBreakName = "file";
00041 const string PredefinedTextAreaBreakName = "textarea";
00042 
00044 typedef DWORD CTokenNo;
00045 
00047 const size_t MaxShortOccurCacheSize = 1000000;
00048 
00049 
00050 
00059 class CShortOccurCache 
00060 {
00062         struct CDataReference 
00063         {
00064                 // the start index of subvector in m_OccurrencesBody
00065                 size_t m_VectorStartOffset;
00066                 // the length of subvector in m_OccurrencesBody
00067                 size_t m_VectorLength;
00068 
00069 
00070                 CDataReference (size_t VectorStartOffset, size_t VectorLength)
00071                 {
00072                         m_VectorStartOffset = VectorStartOffset;
00073                         m_VectorLength = VectorLength;
00074                 };
00075         };
00077         //map<CTokenNo, CDataReference> m_IndexItemNo2Occurrences;
00078         vector <CDataReference> m_IndexItemNo2Occurrences;
00079 
00081         vector<CTokenNo>                        m_OccurrencesBody;
00082 public:
00083         void                            Clear();
00084         size_t                          AddNewIndexItemNoToCache(const CTokenNo* pStart, const CTokenNo* pEnd );
00085         const CTokenNo*         GetOccurrencesFromCache(const int CacheId, DWORD& Length) const;
00086         bool                            CouldContainMore() const;
00087 
00088 };
00089 
// Kinds of hit ordering. The numeric values are spelled out explicitly and
// run consecutively from 0; HitSortsCount (11) is the number of real kinds
// and is also used in this file as the "unset" filter type of
// CDDCFilterWithBounds.
enum HitSortEnum
{
    NoSort                 = 0,

    // by date, ascending / descending
    LessByDate             = 1,
    GreaterByDate          = 2,

    // by size, ascending / descending
    LessBySize             = 3,
    GreaterBySize          = 4,

    // by a free bibliographic field, ascending / descending
    LessByFreeBiblField    = 5,
    GreaterByFreeBiblField = 6,

    // by rank, ascending / descending
    LessByRank             = 7,
    GreaterByRank          = 8,

    // by left / right context (ascending only)
    LessByLeftContext      = 9,
    LessByRightContext     = 10,

    // number of enumerators above; not a sort kind itself
    HitSortsCount          = 11
};
00119 
00120 
00128 struct CDDCFilterWithBounds 
00129 {
00131         HitSortEnum             m_FilterType;
00133         string                  m_FreeBiblAttribName;
00134 
00136         bool                    m_bRegExp;
00138         int                             m_LevelStart;
00140         int                             m_LevelEnd;
00142         set<int>                m_SatisfiedValues;  
00143 
00144         CDDCFilterWithBounds() 
00145         {
00146                 m_bRegExp = false;
00147                 m_FilterType = HitSortsCount;
00148                 m_LevelStart = INT_MIN;
00149                 m_LevelEnd = INT_MAX; 
00150         }
00151 
00152 };
00153 
00157 struct CHit 
00158 {
00160         DWORD   m_BreakNo;
00162         DWORD   m_HighlightOccurrenceEnd;
00164         int             m_OrderId;
00166         DWORD   m_FileNo;
00168         size_t  m_DebugRankNo;
00169 
00170         CHit(DWORD BreakNo) : m_BreakNo(BreakNo), m_OrderId(0) {};
00171 
00172         bool operator < (const CHit& X) const 
00173         {
00174                 return  m_OrderId < X.m_OrderId;
00175         };
00176 };
00177 
00178 
00179 
00181 typedef map<size_t, vector<DWORD> > PeriodsDivisionMap;
00182 
00184 typedef map<string,CShortOccurCache> CShortOccurCacheMap;
00185 
00187 typedef vector<CTokenNo> COccurrBuffer;
00188 
00190 const string MorphAnnotationsDelim = "#";
00192 const string MorphAnnotationsDelimRegExp = "[^#]*";
00193 
00195 extern bool InitConcordDicts();
00197 extern void FreeConcordDicts();
00199 extern const CLemmatizer* GetLemmatizerByLanguage (MorphLanguageEnum Langua);
00201 extern const CAgramtab* GetGramtabByLanguage (MorphLanguageEnum Langua);
00203 extern void concord_daemon_log(const string&  t);
00205 extern string GetDDCErrorString(DDCErrorEnum ErrorCode);
00206 
00207 
// Selects whether a bigram index is used and, if so, which one.
// Values are consecutive starting at 0.
enum BigramDirectionEnum
{
    bdDontUseBigrams = 0,   // bigrams are not used
    bdLeftBigram     = 1,   // left bigram
    bdRightBigram    = 2    // right bigram
};
00213 
00214 #endif