00001 #ifndef __ConcCommon_H_
00002 #define __ConcCommon_H_
00003
00011 #include "../common/utilit.h"
00012 #include "list"
00013 #include "limits.h"
00014
00015
00016 #include "../GraphanLib/GraphmatFile.h"
00017 #include "../LemmatizerLib/Lemmatizers.h"
00018
00019 #include "../AgramtabLib/EngGramTab.h"
00020 #include "../AgramtabLib/RusGramTab.h"
00021 #include "../AgramtabLib/GerGramTab.h"
00022 #include "../common/DDC_common.h"
00023 #include "../tinyxml/tinyxml.h"
00024
00025
00026
00027
// Delimiters and predefined names shared by the concordance code.
// NOTE(review): the roles described below are inferred from the identifiers
// only; the code that consumes these constants is not part of this header.

/// Field separator character: a horizontal tab.
const char globalFieldDelimeter('\t');

/// Tag "l" (presumably marks one line of a predefined table - confirm).
const string PredefinedTableLineTag("l");

/// Name "chunk" (presumably the name of the chunk index - confirm).
const string ChunkIndexName("chunk");

/// Name "left" (presumably the index of left bigrams - confirm).
const string LeftBigramsIndexName("left");

/// Name "right" (presumably the index of right bigrams - confirm).
const string RightBigramsIndexName("right");

/// Name "file" (presumably the predefined break that separates files - confirm).
const string PredefinedFileBreakName("file");

/// Name "textarea" (presumably the predefined break that separates text areas - confirm).
const string PredefinedTextAreaBreakName("textarea");
00042
// Integral type (an alias of DWORD) used in this header for the elements of
// occurrence lists: see CShortOccurCache and COccurrBuffer.
00044 typedef DWORD CTokenNo;
00045
// Size limit associated with CShortOccurCache.
// NOTE(review): presumably the bound checked by CShortOccurCache::CouldContainMore();
// whether it counts cached tokens, cached lists or bytes is not visible here - confirm.
00047 const size_t MaxShortOccurCacheSize = 1000000;
00048
00049
00050
00059 class CShortOccurCache
00060 {
00062 struct CDataReference
00063 {
00064
00065 size_t m_VectorStartOffset;
00066
00067 size_t m_VectorLength;
00068
00069
00070 CDataReference (size_t VectorStartOffset, size_t VectorLength)
00071 {
00072 m_VectorStartOffset = VectorStartOffset;
00073 m_VectorLength = VectorLength;
00074 };
00075 };
00077
00078 vector <CDataReference> m_IndexItemNo2Occurrences;
00079
00081 vector<CTokenNo> m_OccurrencesBody;
00082 public:
00083 void Clear();
00084 size_t AddNewIndexItemNoToCache(const CTokenNo* pStart, const CTokenNo* pEnd );
00085 const CTokenNo* GetOccurrencesFromCache(const int CacheId, DWORD& Length) const;
00086 bool CouldContainMore() const;
00087
00088 };
00089
/// Ways of ordering hits. CDDCFilterWithBounds::m_FilterType also uses this
/// type. The numeric values are spelled out and must stay stable.
/// NOTE(review): "Less"/"Greater" presumably mean ascending/descending order -
/// confirm against the sorting code.
enum HitSortEnum
{
    NoSort                 = 0,   ///< no sorting requested
    LessByDate             = 1,   ///< by date, "less" direction
    GreaterByDate          = 2,   ///< by date, "greater" direction
    LessBySize             = 3,   ///< by size, "less" direction
    GreaterBySize          = 4,   ///< by size, "greater" direction
    LessByFreeBiblField    = 5,   ///< by a free bibliographic field, "less" direction
    GreaterByFreeBiblField = 6,   ///< by a free bibliographic field, "greater" direction
    LessByRank             = 7,   ///< by rank, "less" direction
    GreaterByRank          = 8,   ///< by rank, "greater" direction
    LessByLeftContext      = 9,   ///< by left context
    LessByRightContext     = 10,  ///< by right context
    HitSortsCount          = 11   ///< count of the enumerators above (0..10)
};
00119
00120
00128 struct CDDCFilterWithBounds
00129 {
00131 HitSortEnum m_FilterType;
00133 string m_FreeBiblAttribName;
00134
00136 bool m_bRegExp;
00138 int m_LevelStart;
00140 int m_LevelEnd;
00142 set<int> m_SatisfiedValues;
00143
00144 CDDCFilterWithBounds()
00145 {
00146 m_bRegExp = false;
00147 m_FilterType = HitSortsCount;
00148 m_LevelStart = INT_MIN;
00149 m_LevelEnd = INT_MAX;
00150 }
00151
00152 };
00153
00157 struct CHit
00158 {
00160 DWORD m_BreakNo;
00162 DWORD m_HighlightOccurrenceEnd;
00164 int m_OrderId;
00166 DWORD m_FileNo;
00168 size_t m_DebugRankNo;
00169
00170 CHit(DWORD BreakNo) : m_BreakNo(BreakNo), m_OrderId(0) {};
00171
00172 bool operator < (const CHit& X) const
00173 {
00174 return m_OrderId < X.m_OrderId;
00175 };
00176 };
00177
00178
00179
// Maps a size_t key to a vector of DWORD values.
// NOTE(review): the name suggests a division of items into periods; what the
// key and the values denote is not visible in this header - confirm.
00181 typedef map<size_t, vector<DWORD> > PeriodsDivisionMap;
00182
// One CShortOccurCache per string key.
// NOTE(review): the meaning of the key is not visible in this header.
00184 typedef map<string,CShortOccurCache> CShortOccurCacheMap;
00185
// Growable buffer of token numbers (CTokenNo).
00187 typedef vector<CTokenNo> COccurrBuffer;
00188
/// The delimiter string "#".
/// NOTE(review): presumably separates morphological annotations - confirm.
const string MorphAnnotationsDelim("#");

/// Regular-expression fragment that matches any (possibly empty) run of
/// characters other than '#', i.e. text that contains no delimiter.
const string MorphAnnotationsDelimRegExp("[^#]*");
00193
00195 extern bool InitConcordDicts();
00197 extern void FreeConcordDicts();
00199 extern const CLemmatizer* GetLemmatizerByLanguage (MorphLanguageEnum Langua);
00201 extern const CAgramtab* GetGramtabByLanguage (MorphLanguageEnum Langua);
00203 extern void concord_daemon_log(const string& t);
00205 extern string GetDDCErrorString(DDCErrorEnum ErrorCode);
00206
00207
/// Selects whether bigrams are used and, if so, in which direction.
/// The values are now written out explicitly; they are the ones the compiler
/// assigned implicitly before.
/// NOTE(review): presumably related to the "left"/"right" bigram index names
/// declared in this header - confirm.
enum BigramDirectionEnum
{
    bdDontUseBigrams = 0,  ///< bigrams are not used
    bdLeftBigram     = 1,  ///< left bigram
    bdRightBigram    = 2   ///< right bigram
};
00213
00214 #endif