DTA::TokWrap::Document - DTA tokenizer wrappers: document wrapper
use DTA::TokWrap::Document;
##========================================================================
## Constructors etc.
$doc = $CLASS_OR_OBJECT->new(%args);
%defaults = $CLASS->defaults();
$doc = $doc->init();
$doc->DESTROY();
##========================================================================
## Methods: Pseudo-I/O
$newdoc = $CLASS_OR_OBJECT->open($xmlfile,%docNewOptions);
$bool = $doc->close();
@notempkeys = $doc->notempkeys();
@tempfiles = $doc->tempfiles();
##========================================================================
## Methods: pseudo-pseudo-make
$bool = $doc->genKey($key);
$keyval_or_undef = $doc->makeKey($key);
##========================================================================
## Methods: Low-Level: generator-subclass wrappers
$doc_or_undef = $doc->mkindex();
$doc_or_undef = $doc->mkbx0();
$doc_or_undef = $doc->mkbx();
$doc_or_undef = $doc->tokenize();
$doc_or_undef = $doc->tok2xml();
$doc_or_undef = $doc->txmlanno();
##========================================================================
## Methods: Member I/O
$bx0doc_or_undef = $doc->loadBx0File();
$bxdata_or_undef = $doc->loadBxFile();
$cxdata_or_undef = $doc->loadCxFile();
\$tokdata_or_undef = $doc->loadTokFile();
\$xtokdata_or_undef = $doc->loadXtokFile();
$xtokDoc = $doc->xtokDoc();
\$xmlbuf_or_undef = $doc->loadXmlData();
\$txtbuf_or_undef = $doc->loadTxtData();
$file_or_undef = $doc->saveBx0File();
$file_or_undef = $doc->saveBxFile();
$file_or_undef = $doc->saveTxtFile();
$file_or_undef = $doc->saveTokFile();
$file_or_undef = $doc->saveXtokFile();
$file_or_undef = $doc->saveTcfFile();
##========================================================================
## Methods: Profiling
$ntoks_or_undef = $doc->nTokens();
$nxbytes_or_undef = $doc->nXmlBytes();
DTA::TokWrap::Document provides a perl class for representing a single DTA base-format XML file and associated indices. Together with the DTA::TokWrap module, this class comprises the top-level API of the DTA::TokWrap distribution.
DTA::TokWrap::Document inherits from DTA::TokWrap::Base.
$TOKENIZE_CLASS
Default tokenizer sub-processor class (default='DTA::TokWrap::Processor::tokenize').
Field indices in .cx files generated by the mkindex() method.
$doc = $CLASS_OR_OBJECT->new(%args);
Low-level constructor for document wrapper object. You should probably use either DTA::TokWrap->open() or DTA::TokWrap::Document->open() instead of calling this constructor directly.
%args, %$doc:
##-- Document class
class => $class, ##-- delegate call to $class->new(%args)
##
##-- Source data
xmlfile => $xmlfile, ##-- source filename
xmlbase => $xmlbase, ##-- xml:base for generated files (default=basename($xmlfile))
xmldata => $xmldata, ##-- source buffer (for addws, tcfencode)
##
##-- pseudo-make options
traceMake => $level, ##-- log-level for makeKey() trace (e.g. 'debug'; default=undef (none))
traceGen => $level, ##-- log-level for genKey() trace (e.g. 'trace'; default=undef (none))
traceProc => $level, ##-- log-level for document-called processor calls (default=none)
traceLoad => $level, ##-- log-level for load* trace (default=none)
traceSave => $level, ##-- log-level for save* trace (default=none)
genDummy => $bool, ##-- if true, generator will not actually run (a la `make -n`)
##
##-- generator data (optional)
tw => $tw, ##-- a DTA::TokWrap object storing individual generators
traceOpen => $level, ##-- log-level for open() trace (e.g. 'info'; default=undef (none))
traceClose => $level, ##-- log-level for close() trace (e.g. 'trace'; default=undef (none))
##
##-- generated data (common)
outdir => $outdir, ##-- output directory for generated data (default=.)
tmpdir => $tmpdir, ##-- temporary directory for generated data (default=$ENV{DTATW_TMP}||$outdir)
keeptmp => $bool, ##-- if true, temporary document-local files will be kept on $doc->close()
notmpre => $regex, ##-- non-temporary filename regex
notmpkeys => $keys, ##-- non-temporary keys, space-separated list
outbase => $filebase, ##-- output basename (default=`basename $xmlbase .xml`)
format => $level, ##-- default formatting level for XML output
##
##-- mkindex data (see DTA::TokWrap::Processor::mkindex)
cxfile => $cxfile, ##-- character index file (default="$tmpdir/$outbase.cx")
cxdata => $cxdata, ##-- character index data (see loadCxFile() method)
sxfile => $sxfile, ##-- structure index file (default="$tmpdir/$outbase.sx")
txfile => $txfile, ##-- raw text index file (default="$tmpdir/$outbase.tx")
##
##-- mkbx0 data (see DTA::TokWrap::Processor::mkbx0)
bx0doc => $bx0doc, ##-- pre-serialized block-index XML::LibXML::Document
bx0file => $bx0file, ##-- pre-serialized block-index XML file (default="$outbase.bx0"; optional)
##
##-- mkbx data (see DTA::TokWrap::Processor::mkbx)
bxdata => \@bxdata, ##-- block-list, see DTA::TokWrap::mkbx::mkbx() for details
bxfile => $bxfile, ##-- serialized block-index CSV file (default="$tmpdir/$outbase.bx"; optional)
txtfile => $txtfile, ##-- serialized & hinted text file (default="$tmpdir/$outbase.txt"; optional)
txtdata => $txtdata, ##-- serialized & hinted text file (used by tcfencode, must be loaded explicitly with loadTxtData())
##
##-- tokenize data (see DTA::TokWrap::Processor::tokenize, DTA::TokWrap::Processor::tokenize::dummy)
tokdata0 => $tokdata0, ##-- tokenizer output data (slurped string)
tokfile0 => $tokfile0, ##-- tokenizer output file (default="$tmpdir/$outbase.t0"; optional)
##
##-- post-tokenize data (see DTA::TokWrap::Processor::tokenize1)
tokdata1 => $tokdata1, ##-- post-tokenizer output data (slurped string)
tokfile1 => $tokfile1, ##-- post-tokenizer output file (default="$tmpdir/$outbase.t1"; optional)
##
##-- tokenizer xml data (see DTA::TokWrap::Processor::tok2xml)
xtokdata => $xtokdata, ##-- XML-ified tokenizer output data
xtokfile => $xtokfile, ##-- XML-ified tokenizer output file (default="$outdir/$outbase.t.xml")
xtokdoc => $xtokdoc, ##-- XML::LibXML::Document for $xtokdata (parsed from string)
##
##-- tokenizer xml annotations (see DTA::TokWrap::Processor::txmlanno)
axtokdata => $axtokdata, ##-- optional external XML annotation data (for splicing into $xtokdata)
axtokfile => $axtokfile, ##-- optional external XML annotation file (for splicing into $xtokfile; default="$outdir/$outbase.ta.xml")
xtokfile0 => $xtokfile0, ##-- XML-ified tokenizer output file (default=none or "$outdir/$outbase.t0.xml" if {keeptmp} is true)
##
##-- ws-splice (see DTA::TokWrap::Processor::addws)
#cwsdata => $cwsdata, ##-- ws-spliced output data (xmlfile with <s> and <w> elements)
cwsfile => $cwsfile, ##-- ws-spliced output file (default="$outdir/$outbase.cws.xml")
##
##-- property-splice (see DTA::TokWrap::Processor::idsplice)
## cwstbasebufr => \$bdata, ##-- base data-ref for idsplice (xml with //*/@id) [default=\$cwsdata if defined]
## cwstbasefile => $bfile, ##-- source file for $bdata [default=$cwsfile]
## cwstsobufr => \$sodata, ##-- standoff data-ref for idsplice (xml with //*/@id, additional attributes and content) [default=\$xtokdata]
## cwstsofile => $sofile, ##-- source file for $sodata [default=$xtokfile]
## cwstbufr => $wstbufr, ##-- idsplice output buffer (base + id-spliced attributes, content) -- available for override, not used by default
## cwstfile => $wstfile, ##-- idsplice output file [default="$outdir/$outbase.cwst.xml"]
##
##-- tcfencode data (see DTA::TokWrap::Processor::tcfencode)
tcfdoc => $tcfdoc, ##-- XML::LibXML::Document representing TCF-encoded data
tcffile => $tcffile, ##-- TCF file
tcflang => $lang, ##-- TCF language attribute (default: 'de')
##
##-- tcftokenize data (see DTA::TokWrap::Processor::tcftokenize)
tcftokdoc => $tcftokdoc, ##-- XML::LibXML::Document representing tokenized TCF data (== $tcfdoc)
tcftokfile => $tcftokfile, ##-- tcf-tokenized file
##
##-- tcfdecode0 data (see DTA::TokWrap::Processor::tcfdecode0)
tcfxfile => $tcfxfile, ##-- tcf-decoded base xml file [default="$tmpdir/$outbase.tcfx"]
tcfxdata => $tcfxdata, ##-- tcf-decoded base xml data
tcftfile => $tcftfile, ##-- tcf-decoded serial text file [default="$tmpdir/$outbase.tcft"]
tcftdata => $tcftdata, ##-- tcf-decoded serial txt data
tcfwdata => $tcfwdata, ##-- tcf-decoded token data, tt-format: "TEXT\tSID/WID\n"
tcfwfile => $tcfwfile, ##-- tcf-decoded token file, tt-format [default="$tmpdir/$outbase.tcfw"]
tcfadata => $tcfadata, ##-- tcf-decoded token attributes for idsplice, data
tcfafile => $tcfafile, ##-- tcf-decoded token attributes for idsplice, file [default="$tmpdir/$outbase.tcfa"]
##
##-- tcfalign data (PROXIED, see DTA::TokWrap::Processor::tcfalign : uses tokdata1,tokfile1)
##-- tcf2txml data (PROXIED, see DTA::TokWrap::Processor::tok2xml : uses tokfile1,cxfile,bxfile,xtokdata)
##-- tcfdecode data
tcfcwsfile => $tcfcwsfile, ##-- tcf-decoded+aligned+ws-spliced output file (default="$outdir/$outbase.tcfws.xml")
%defaults = $CLASS->defaults();
Static object defaults.
$doc = $doc->init();
Set computed object defaults.
$doc->DESTROY();
Destructor. Implicitly calls close().
$newdoc = $CLASS_OR_OBJECT->open($xmlfile,%docNewOptions);
Wrapper for $CLASS_OR_OBJECT->new(), with some additional sanity checks.
$bool = $doc->close();
$bool = $doc->close($is_destructor);
"Closes" document $doc, adding profiling information to $doc->{tw} if present.
Unlinks any temporary files in $doc unless $doc->{keeptmp} is true. All %$doc keys ending in 'file' are considered 'temporary' files, except: xmlfile, xtokfile, sosfile, sowfile, soafile.
If $is_destructor is false (default), resets all keys in %$doc to default values (thus making $doc essentially unusable).
@notempkeys = $doc->notempkeys();
Returns list of document keys ending in 'file' which are not considered "temporary". Used by $doc->tempfiles().
@tempfiles = $doc->tempfiles();
Returns list of temporary filenames which have been generated by $doc, or an empty list if $doc->{keeptmp} is true. Used by $doc->close().
Checks $doc->{"${filekey}_stamp"} to determine whether this document generated the file named by $doc->{"$filekey"}.
Implementation: returns values of all %$doc keys ending with 'file' except for those returned by $doc->notempkeys()
%KEYGEN = ($dataKey => $generatorSpec, ...)
Low-level hash mapping data keys to the generating processes (subroutines, classes, ...).
$generatorSpec is one of:
$key : calls $doc->can($key)->($doc)
\&coderef : calls &coderef($doc)
\@array : array of atomic $generatorSpecs (keys or CODE-refs)
$bool = $doc->genKey($key);
$bool = $doc->genKey($key,\%KEYGEN)
(Re-)generate a data key (single step only, ignoring dependencies). An argument $key without a value $KEYGEN{$key} triggers an error.
$keyval_or_undef = $doc->makeKey($key);
Just an alias for $doc->genKey($key) here, but see DTA::TokWrap::Document::Maker for a more sophisticated implementation.
$doc_or_undef = $doc->mkindex($mkindex);
$doc_or_undef = $doc->mkindex();
$doc_or_undef = $doc->mkbx0($mkbx0);
$doc_or_undef = $doc->mkbx0();
$doc_or_undef = $doc->mkbx($mkbx);
$doc_or_undef = $doc->mkbx();
$doc_or_undef = $doc->tokenize($tokenize);
$doc_or_undef = $doc->tokenize();
see DTA::TokWrap::Processor::tokenize::tokenize(), DTA::TokWrap::Processor::tokenize::http::tokenize(), DTA::TokWrap::Processor::tokenize::tomasotath::tokenize(), DTA::TokWrap::Processor::tokenize::dummy::tokenize().
Default tokenizer subclass is given by package-global $TOKENIZE_CLASS.
$doc_or_undef = $doc->tokenize1($tokenize1);
$doc_or_undef = $doc->tokenize1();
$doc_or_undef = $doc->tok2xml($tok2xml);
$doc_or_undef = $doc->tok2xml();
$doc_or_undef = $doc->txmlanno($txmlanno);
$doc_or_undef = $doc->txmlanno();
$doc_or_undef = $doc->addws($addws);
$doc_or_undef = $doc->addws();
$doc_or_undef = $doc->idsplice($idsplice);
$doc_or_undef = $doc->idsplice();
$doc_or_undef = $doc->tcfencode($tcfencode)
$doc_or_undef = $doc->tcfencode()
$bx0doc_or_undef = $doc->loadBx0File($filename_or_fh);
$bx0doc_or_undef = $doc->loadBx0File();
loads $doc->{bx0doc} from $filename_or_fh (default=$doc->{bx0file})
$bxdata_or_undef = $doc->loadBxFile($bxfile_or_fh,$txtfile_or_fh);
$bxdata_or_undef = $doc->loadBxFile();
loads $doc->{bxdata} from @$doc{qw(bxfile txtfile)}
requires $doc->{txfile}
$cxdata_or_undef = $doc->loadCxFile($filename_or_fh);
$cxdata_or_undef = $doc->loadCxFile();
loads $doc->{cxdata} from $filename_or_fh (default=$doc->{cxfile}).
$doc->{cxdata} = [ $cx0, ... ], where:
each $cx = [ $id, $xoff,$xlen, $toff,$tlen, $text, @attrs ]
package globals $CX_ID, $CX_XOFF, etc. are indices for $cx arrays
\$tokdata_or_undef = $doc->loadTokFileN($n,$filename_or_fh);
\$tokdata_or_undef = $doc->loadTokFileN($n);
loads $doc->{"tokdata${n}"} from $filename_or_fh (default=$doc->{"tokfile${n}"})
\$tokdata0_or_undef = $doc->loadTokFile0(@args)
Wrapper for $doc->loadTokFileN(0,@args)
\$tokdata1_or_undef = $doc->loadTokFile1(@args)
Wrapper for $doc->loadTokFileN(1,@args)
\$xtokdata_or_undef = $doc->loadXtokFile($filename_or_fh);
\$xtokdata_or_undef = $doc->loadXtokFile();
loads $doc->{xtokdata} from $filename_or_fh (default=$doc->{xtokfile})
see also $doc->xtokDoc().
$xtokDoc = $doc->xtokDoc(\$xtokdata);
$xtokDoc = $doc->xtokDoc();
parse \$xtokdata (default: \$doc->{xtokdata}) string into $doc->{xtokdoc}
warning: may call $doc->tok2xml()
$xmlbuf_or_undef = $doc->loadXmlData($filename_or_fh)
$xmlbuf_or_undef = $doc->loadXmlData()
loads $doc->{xmldata} from $filename_or_fh (default=$doc->{xmlfile}).
\$xmlbuf_or_undef = $doc->loadCwsData($filename_or_fh)
\$xmlbuf_or_undef = $doc->loadCwsData()
DEPRECATED
loads $doc->{cwsdata} from $filename_or_fh (default=$doc->{cwsfile}).
\$txtbuf_or_undef = $doc->loadTxtData($filename_or_fh)
\$txtbuf_or_undef = $doc->loadTxtData()
loads $doc->{txtdata} from $filename_or_fh (default=$doc->{txtfile})
$file_or_undef = $doc->saveBx0File($filename_or_fh,$bx0doc,%opts);
$file_or_undef = $doc->saveBx0File($filename_or_fh);
$file_or_undef = $doc->saveBx0File();
Saves $bx0doc (default=$doc->{bx0doc}) to $filename_or_fh (default=$doc->{bx0file}="$doc->{outdir}/$doc->{outbase}.bx0"), and sets both $doc->{bx0file} and $doc->{bx0file_stamp}.
%opts:
format => $level, ##-- output format (default=$doc->{format})
$file_or_undef = $doc->saveBxFile($filename_or_fh,\@blocks);
$file_or_undef = $doc->saveBxFile($filename_or_fh);
$file_or_undef = $doc->saveBxFile();
Saves text-block data \@blocks (default=$doc->{bxdata}) to $filename_or_fh (default=$doc->{bxfile}), and sets both $doc->{bxfile} and $doc->{bxfile_stamp}.
$file_or_undef = $doc->saveTxtFile($filename_or_fh,\@blocks,%opts);
$file_or_undef = $doc->saveTxtFile($filename_or_fh);
$file_or_undef = $doc->saveTxtFile();
Saves serialized text extracted from \@blocks (default=$doc->{bxdata}) to $filename_or_fh (default=$doc->{txtfile}="$doc->{outdir}/$doc->{outbase}.txt"), and sets both $doc->{txtfile} and $doc->{txtfile_stamp}.
%opts:
debug=>$bool, ##-- if true, debugging text will be printed (and saveBxFile() offsets will be wrong)
$file_or_undef = $doc->saveTokFileN($n,$filename_or_fh,\$tokdata);
$file_or_undef = $doc->saveTokFileN($n,$filename_or_fh);
$file_or_undef = $doc->saveTokFileN($n);
Saves tokenizer output data string $tokdata (default=$doc->{"tokdata${n}"}) to $filename_or_fh (default=$doc->{"tokfile${n}"}="$doc->{outdir}/$doc->{outbase}.t${n}"), and sets both $doc->{"tokfile${n}"} and $doc->{"tokfile_stamp${n}"}.
$file_or_undef = $doc->saveTokFile0(@args)
Wrapper for $doc->saveTokFileN(0,@args)
$file_or_undef = $doc->saveTokFile1(@args)
Wrapper for $doc->saveTokFileN(1,@args)
$file_or_undef = $doc->saveXtokFile($filename_or_fh,\$xtokdata,%opts);
$file_or_undef = $doc->saveXtokFile($filename_or_fh);
$file_or_undef = $doc->saveXtokFile();
Saves XML-ified master tokenizer data string $xtokdata (default=$doc->{xtokdata}) to $filename_or_fh (default=$doc->{xtokfile}="$doc->{outdir}/$doc->{outbase}.t.xml"), and sets both $doc->{xtokfile} and $doc->{xtokfile_stamp}.
$file_or_undef = $doc->saveTcfFile($filename_or_fh,$tcfdoc,%opts)
$file_or_undef = $doc->saveTcfFile($filename_or_fh)
$file_or_undef = $doc->saveTcfFile()
known %opts:
format => $level, ##-- formatting level (default=1)
Saves TCF-encoded document $tcfdoc (default=$doc->{tcfdoc}) to $filename_or_fh (default=$doc->{tcffile}="$doc->{outdir}/$doc->{outbase}.t.xml"), and sets $doc->{tcffile_stamp}.
$ntoks_or_undef = $doc->nTokens();
Returns number of tokens in the currently opened document, if known.
$nxbytes_or_undef = $doc->nXmlBytes();
Returns the number of bytes in the base-format XML file, if known (and it should always be known!).
DTA::TokWrap::Intro(3pm), dta-tokwrap.perl(1), ...
DTA::TokWrap::Intro(3pm), dta-tokwrap.perl(1), ...
Bryan Jurish <jurish@bbaw.de>
Copyright (C) 2009-2018 by Bryan Jurish
This package is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.14.2 or, at your option, any later version of Perl 5 you may have available.