Go to the documentation of this file.
00001
00009 #ifndef ConcIndexator_h
00010 #define ConcIndexator_h
00011
00012 #if _MSC_VER > 1000
00013 #pragma once
00014 #endif // _MSC_VER > 1000
00015
00016
00017 #include "../ConcordLib/ConcCommon.h"
00018 #include "../ConcordLib/Bibliography.h"
00019 #include "../ConcordLib/StringIndexator.h"
00020 #include "../ConcordLib/HitBorder.h"
00021
00022
00023
00024
// Bundle of four tag strings plus a flag, together with a default constructor,
// a parser from a string and a serializer to a string. All three functions are
// only declared here; their definitions are outside this header.
// NOTE(review): judging by the member names and by the m_HtmlHighlighting /
// m_TextHighlighting members of CConcIndexator, these are presumably the
// markers wrapped around highlighted hits in output - confirm in the .cpp.
00025 struct CHighlightTags
00026 {
// NOTE(review): presumably set when the tags were loaded via ReadFromString()
// rather than left at the constructor's values - confirm.
00028 bool m_bWasReadFromString;
// Opener/closer pair labelled "First".
// NOTE(review): presumably applied to the first highlighted item - confirm.
00030 string m_FirstOpener;
00032 string m_FirstCloser;
// Opener/closer pair labelled "Rest".
// NOTE(review): presumably applied to every highlighted item after the first - confirm.
00034 string m_RestOpener;
00036 string m_RestCloser;
00037 CHighlightTags();
// Takes the textual form by const reference and reports a bool result.
// NOTE(review): presumably false means `s` could not be parsed - confirm.
00038 bool ReadFromString(const string& s);
// Const member returning a string by value.
// NOTE(review): presumably the inverse of ReadFromString() - confirm.
00039 string ToString() const;
00040
00041 };
00042
00043
// Owner of a list of source file names (a vector of strings) and a flag that
// records whether that list was modified. Only declarations are present here;
// every member function is defined outside this header.
// CConcIndexator (declared later in this file) derives publicly from this class.
00048 class CSourceFileHolder
00049 {
// The two data members below are private: they precede the `public:` label
// in a `class`, whose default access is private.
00051 vector<string> m_SourceFiles;
00052 bool m_bModifiedListOfFiles;
00053
00054 public:
00055 CSourceFileHolder();
// Takes the file name by value and returns bool.
// NOTE(review): presumably writes m_SourceFiles to FileName and returns false on failure - confirm.
00057 bool SaveSourceFileList(string FileName);
00058
// Note the index type is `long` here, whereas GetSourceFile() takes `size_t`.
// NOTE(review): behaviour for an out-of-range ItemNo is not visible here - check the definition.
00060 void DeleteSourceFile(long ItemNo);
00061
// Takes a C string (const char*), unlike the string-by-value parameters used elsewhere in this class.
00063 void AddSourceFile(const char* FileName);
00064
00066 void DeleteAllSourceFiles();
00067
// Const accessor returning a size_t count.
00069 size_t GetSourceFilesCount() const;
00070
// Const accessor returning one file name by value (a copy).
// NOTE(review): bounds handling for FileNo is not visible here - check the definition.
00072 string GetSourceFile(size_t FileNo) const;
00073
// Takes another holder by const reference.
// NOTE(review): presumably appends X's file names to this list - confirm.
00074 void AddSourceFilesFrom(const CSourceFileHolder& X);
00075
// Counterpart of SaveSourceFileList() by name; takes the file name by value and returns bool.
00076 bool ReadSourceFileList(string FileName);
00077
// Const member returning int.
// NOTE(review): presumably the index of the first listed file that does not
// exist on disk, with some sentinel (e.g. -1) when all exist - confirm.
00079 int FoundNotExistedFile ( ) const;
00080
00081
// Const accessor returning bool.
// NOTE(review): presumably returns m_bModifiedListOfFiles - confirm.
00082 bool IsModified ( ) const;
00083
00084 };
00085
00086
00087 class CDwdsThesaurus;
// Indexer class combining three public bases: CStringIndexator, CHitBorders and
// CSourceFileHolder. It carries the indexing/query options as data members and
// declares the operations for loading/saving a project, indexing files and
// finalizing an index. Apart from a few inline accessors, all member functions
// are defined outside this header.
00094 class CConcIndexator : public CStringIndexator, public CHitBorders, public CSourceFileHolder
00095 {
// Everything up to the `public:` label below is private (default access of `class`).

// Kind of index, stored in m_IndexType. Three enumerators are declared.
// NOTE(review): the meaning of each kind is not visible in this header; by name
// they presumably select the input format handled by the Index* helpers below - confirm.
00099 enum DDCIndexTypeEnum {
00104 DWDS_Index,
00108 MorphXML_Index,
00116 Free_Index
00117 };
00118
// Byte buffer whose data() pointer is handed to RML_RE::Options in
// GetRegexOptions() when m_Utf8 is false.
// NOTE(review): presumably locale-specific PCRE character tables - confirm.
00120 vector<BYTE> m_PcreCharacterTables;
00121
// Private option flags. Several are exposed read-only through the inline
// accessors in the public section (IsDwdsCorpusInterface, IsGutenbergInterface,
// HasContextOperator, UseDwdsThesaurus, OutputBibliographyOfHits).
00123 bool m_bUseParagraphTagToDivide;
00125 bool m_bEmptyLineIsSentenceDelim;
00127 bool m_bUseIndention;
00129 bool m_bDwdsCorpusInterface;
00131 bool m_bGutenbergInterface;
// Stored in negated form: HasContextOperator() returns !m_bNoContextOperator.
00133 bool m_bNoContextOperator;
// A DWORD value paired with a bool of almost the same name.
// NOTE(review): presumably the bool tells whether the user supplied the value - confirm.
00135 DWORD m_UserMaxTokenCountInOnePeriod;
00136 bool m_bUserMaxTokenCountInOnePeriod;
00138 bool m_bUseDwdsThesaurus;
00140 bool m_bOutputBibliographyOfHits;
00142 bool m_bIndexPunctuation;
00144 DDCIndexTypeEnum m_IndexType;
00145
00146
// Three path/prefix strings.
// NOTE(review): their exact use is not visible here; GetHtmlReference() and
// GetShortFilename() are the likely consumers - confirm.
00147 string m_InternetPathPrefix;
00148 string m_LocalPathPrefix;
00149 string m_CommonFilePrefix;
00150
00151
// Private per-format indexing helpers. Each takes the file name by value and
// the file contents as a C string, updates NewCorpusEndTokenNo through a
// non-const reference, writes diagnostics to strError and returns bool.
// NOTE(review): presumably false signals failure with the reason in strError - confirm.
00152 bool IndexTextOrHtmlFile ( CGraphmatFile* piGraphmat, string FileName, const char* pFileBuffer, const CDwdsThesaurus* pDwdsThesaurus, CTokenNo& NewCorpusEndTokenNo,string& strError);
00153 bool IndexMorphXml (string FileName, const char* pFileBuffer, CTokenNo& NewCorpusEndTokenNo, string& strError);
00154 bool IndexTable(string FileName, const char* pFileBuffer, CTokenNo& NewCorpusEndTokenNo, string& strError);
// page_breaks_count is an in/out counter passed by non-const reference.
00155 bool IndexOneTableTextArea(const string& Text, const CPageNumber& StartPageFromHeader, size_t& page_breaks_count, CTokenNo& NewCorpusEndTokenNo, string& strError);
00156
00157 void AssertHasPath() const;
00158 string GetBiblIndexFileName() const;
00159 string GetBiblFileName() const;
00160
00161
// Options (de)serialization to/from a single string; the loader takes its
// argument by value and reports a bool result.
00163 string SaveOptionsToString() const;
00165 bool LoadOptionsFromString(string Options);
00167 bool IsDWDSToken (const CGraphmatFile* piGraphmat, long GraLine) const;
// Compares this object's options with another indexator's.
// NOTE(review): which options take part in the comparison is not visible here.
00169 bool HasEqualOptions(const CConcIndexator& X) const;
00170
// String <-> m_IndexType conversion pair (by name); the reader returns bool.
00172 const char* GetIndexTypeStr () const;
00174 bool ReadIndexTypeFromStr (const string& s);
// Both loaders receive the file contents in pFileBuffer, a graphematics object
// by pointer and a CBibliography by non-const reference (so they may fill it in).
00175 bool LoadXmlFile(string FileName, const char* pFileBuffer, CGraphmatFile* piGraphmat, CBibliography& Bibl, string& strError);
00176 bool LoadFileIntoGraphan(string FileName, const char* pFileBuffer, CGraphmatFile* piGraphmat, CBibliography& Bibl, string& strError);
00177 void InitDefaultOptions();
00178 public:
// Public option and state members. Their defaults are not visible in this
// header. NOTE(review): presumably set by InitDefaultOptions() and/or the
// constructor - confirm.
00180 MorphLanguageEnum m_Language;
00182 bool m_bIndexMorphPatterns;
00184 bool m_bIndexChunks;
00186 bool m_bCaseSensitive;
00188 bool m_bShowNumberOfRelevantDocuments;
00190 bool m_bQueryOnlyFiles;
00192 bool m_bArchiveIndex;
00194 bool m_bResumeOnIndexErrors;
// List of corpus file names and a set of size_t values for masked files.
// NOTE(review): the set presumably holds indices into m_CorpusFiles - confirm.
00196 vector<string> m_CorpusFiles;
00198 set<size_t> m_MaskedFiles;
00200 CConcXml m_Bibl;
// Two independent tag sets, one named for HTML output and one for text output.
00202 CHighlightTags m_HtmlHighlighting;
00204 CHighlightTags m_TextHighlighting;
00206 bool m_bDisableDefaultQueryLexicalExpansion;
// KWIC-related sizes stored as signed ints.
// NOTE(review): units (tokens vs. characters) are not visible here.
00208 int m_LeftKwicContextSize;
00210 int m_RightKwicContextSize;
00212 int m_NumberOfKwicLinesInSnippets;
// Three double-valued members named as ranks.
// NOTE(review): presumably ranking weights - confirm.
00214 double m_TfIdfRank;
00216 double m_NearRank;
00218 double m_PositionRank;
00220 string m_InterpDelimiter;
00222 string m_TokenDelimiter;
// Selects the branch taken by GetRegexOptions() below.
00224 bool m_Utf8;
00226 vector<size_t> m_IndicesToShow;
00227
00228
00229 CConcIndexator();
// Non-virtual destructor.
// NOTE(review): whether any base declares a virtual destructor is not visible here.
00230 ~CConcIndexator();
00231
00232
00233
00235
00236
00237
00239
00240
// Builds the regex options object for this index:
//  - when m_Utf8 is set: options made from pcrecpp::UTF8() with a NULL second argument;
//  - otherwise: default pcrecpp::RE_Options() together with the pointer to the
//    m_PcreCharacterTables buffer.
// The returned object may hold the m_PcreCharacterTables.data() pointer.
// NOTE(review): if so, it must not outlive or straddle a resize of that vector -
// confirm how RML_RE::Options stores it.
00241 RML_RE::Options GetRegexOptions() const
00242 {
00243 if (m_Utf8)
00244 return RML_RE::Options(pcrecpp::UTF8(), NULL);
00245 else
00246 return RML_RE::Options(pcrecpp::RE_Options(), m_PcreCharacterTables.data());
00247 };
00248
// Read-only inline accessors for private flags. Each returns the flag as
// stored, except HasContextOperator(), which returns the negation of
// m_bNoContextOperator.
00250 bool IsDwdsCorpusInterface() const { return m_bDwdsCorpusInterface; };
00252 bool IsGutenbergInterface() const { return m_bGutenbergInterface; };
00254 bool HasContextOperator() const { return !m_bNoContextOperator; };
00256 bool UseDwdsThesaurus() const { return m_bUseDwdsThesaurus; };
00258 bool OutputBibliographyOfHits() const { return m_bOutputBibliographyOfHits; };
// File-related string getters; the first two take a size_t named posFile.
// NOTE(review): posFile is presumably an index into m_CorpusFiles - confirm.
00260 string GetHtmlReference(size_t posFile) const;
00262 string GetShortFilename(size_t posFile) const;
00264 string GetFileNameForCorpusFileNames() const;
00266 string GetFileNameForMaskedFiles() const;
00267
// Non-const member returning a vector of strings for one output token.
00269 vector<string> GetTokenFields(const COutputToken &tok);
00270
00271
00272
// Const member that receives a graphematics object by pointer.
// NOTE(review): presumably copies this indexator's options into it - confirm.
00274 void InitGraphanProperties (CGraphmatFile* piGraphmat) const;
00276 bool WasIndexed() const;
00277
00278
// Project load/save entry points. All return bool.
// NOTE(review): presumably false on failure - confirm.
00280 bool LoadSourceFilesAndOptions(string FileName);
00281
00283 bool LoadCorpusFiles();
00284
00286 bool LoadMaskedFiles();
00287
00289 bool SaveOptions(string FileName) const;
00290
00292 bool SaveCorpusFileList() const;
00293
00295 bool LoadProject(string FileName);
00296
00297
00298
00299
00300
00301
00302
// Indexing life-cycle operations, all returning bool.
// NOTE(review): the expected call order (by name: start, per-file indexing,
// then a normal or aborted end) is not visible here - confirm.
00304 bool StartIndexing();
00306 bool DestroyIndex();
00308 bool NormalEndIndexing();
00310 bool TerminateIndexing();
// Public per-file entry point; its parameter list mirrors the private
// IndexTextOrHtmlFile() helper.
// NOTE(review): presumably dispatches on m_IndexType to the private Index* helpers - confirm.
00312 bool IndexOneFile(CGraphmatFile* piGraphmat, string FileName, const char* pFileBuffer, const CDwdsThesaurus* pDwdsThesaurus, CTokenNo& CorpusEndTokenNo,string& strError);
00314 void CalculateSearchPeriods (DWORD MaxTokenCountInOnePeriod);
// Takes two other indexators by const reference and returns bool.
// NOTE(review): presumably builds this index as the merge of the two - confirm.
00316 bool CreateAsUnion(const CConcIndexator& _X1, const CConcIndexator& _X2);
00318 bool CreateMorphIndex();
00320 DWORD GetMaxTokenCountInOnePeriod() const;
// Non-const member mapping a vector of token property strings plus a bRegexp
// flag to a result string.
00322 string GetIndexItemSetByVectorString (const vector<string>& TokenProperties, bool bRegexp );
00323
00324 };
00325
00326
00328
// Driver object for building an index from a project file: the single public
// operation is BuildIndex(). It also exposes progress/state fields as public
// data members. All member functions are defined outside this header.
00333 class CConcIndexatorInvoker
00334 {
// Private helpers (default access of `class`). All are const members.
// Both file-name helpers take a path by value and return a string.
00335 string GetTimeStatisticsFileName(string Path) const;
00336 string GetErrorLogFileName(string Path) const;
// Take the indexator plus the corpus end token number and the maximum token
// count per period as DWORDs. FinalizeIndex() receives the indexator by
// non-const reference, so it may modify it.
00337 void WriteTimeStatistics (const CConcIndexator& Indexator, DWORD CorpusEndTokenNo, DWORD MaxTokenCountInOnePeriod) const;
00338 bool FinalizeIndex (CConcIndexator& Indexator, DWORD CorpusEndTokenNo, DWORD MaxTokenCountInOnePeriod) const;
// NOTE(review): presumably the code path taken when m_bOnlyReindexMorphology is set - confirm.
00339 bool BuildOnlyMorphIndex (string ProjectFile) const;
00340 public:
// Public control/status flags.
// NOTE(review): which of these are inputs set by the caller and which are
// outputs set during BuildIndex() is not visible here.
00342 bool m_bStoppedByUser;
00344 bool m_bCorporaProcessing;
00346 bool m_bOnlyReindexMorphology;
00348 bool m_bSkipInitialFileChecking;
00349
// Declared `mutable` so that it can be assigned from const member functions;
// SetCurrMessage() below is such a const member.
00351 mutable string m_CurrMessage;
00352
// NOTE(review): presumably routes messages to standard output when true - confirm.
00354 bool m_bStdout;
00355
// Progress fields: a current file number, a total number of files (both
// signed int) and the current file's name.
00357 int m_CurrentSourceFileNo;
00358
00360 int m_SourceFilesNumber;
00361
00363 string m_CurrentSourceFileName;
00364
// Declared `mutable` so const helpers can update it.
// NOTE(review): presumably timing data consumed by WriteTimeStatistics() - confirm.
00366 mutable CMyTimeSpanHolder m_Profiler;
00367
00368 CConcIndexatorInvoker ();
00369
// Const member taking the message by value.
// NOTE(review): presumably stores it in m_CurrMessage - confirm.
00371 void SetCurrMessage( string Message) const;
00372
// Non-const entry point; takes the project file name by value and returns bool.
// NOTE(review): presumably false on failure or user stop - confirm.
00374 bool BuildIndex(string ProjectFile);
00375 };
00376
00377
00378
00379
00380 #endif