Source for file lucene-fileindex-defs.php

Documentation is available at lucene-fileindex-defs.php

  1. <?php
  2. /* ******************************************************************** */
  3. /* CATALYST PHP Source Code */
  4. /* -------------------------------------------------------------------- */
  5. /* This program is free software; you can redistribute it and/or modify */
  6. /* it under the terms of the GNU General Public License as published by */
  7. /* the Free Software Foundation; either version 2 of the License, or */
  8. /* (at your option) any later version. */
  9. /* */
  10. /* This program is distributed in the hope that it will be useful, */
  11. /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
  12. /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
  13. /* GNU General Public License for more details. */
  14. /* */
  15. /* You should have received a copy of the GNU General Public License */
  16. /* along with this program; if not, write to: */
  17. /* The Free Software Foundation, Inc., 59 Temple Place, Suite 330, */
  18. /* Boston, MA 02111-1307 USA */
  19. /* -------------------------------------------------------------------- */
  20. /* */
  21. /* Filename: lucene-fileindex-defs.php */
  22. /* Author: Paul Waite */
  23. /* Description: Search Engine Module */
  24. /* Specialised indexing class for indexing file content. */
  25. /* Still tied to the deprecated lucene-defs.php module. */
  26. /* */
  27. /* ******************************************************************** */
  28. /** @package search */* The file indexer class.
  29. * This class indexes files on disc, either one by one or as a whole
  30. * file hierarchy tree.
  31. * @package search
  32. */
  33. class fileindexer {
  34. // Public
  35. /** Application we are indexing for */
  36.  
  37. var $application = "";
  38. /** Host to connect to */
  39.  
  40. var $host = "";
  41. /** Port to connect to */
  42.  
  43. var $port = "";
  44.  
  45. // Private
  46. /** The index ID
  47. @access private */
  48. var $ixid;
  49. /** ID generation source
  50. @access private */
  51. var $idsource = ID_FROM_INC;
  52. /** Scan for meta tags as fields in file content. Recommended.
  53. @access private */
  54. var $metascan = true;
  55. /** Meta fields definitions array. Contains definitions
  56. for the fields we will process if found as meta tags.
  57. @access private */
  58. var $meta_fields = array();
  59. /** Index fields definitions array. Contains definitions
  60. for the fields we are expecting to index.
  61. @access private */
  62. var $field_definitions = array();
  63. /** Fields for indexing. This is an array of fieldname/value
  64. pairs which should be added during the indexing. These
  65. fields do not have to appear in $field_definitions.
  66. @access private */
  67. var $indexfields = array();
  68. /** ID generation offset
  69. @access private */
  70. var $idoffset = 0;
  71. /** ID generation prefix
  72. @access private */
  73. var $idprefix = "";
  74. /** Timeout for indexing commands in seconds (can usually leave
  75. as nullstring)
  76. @access private */
  77. var $timeoutsecs = "";
  78. /** Path to a lockfile we should give way to. If this value
  79. is not nullstring, then no indexing will be done while the
  80. file exists. If lockfile_wait is > 0, then we only wait
  81. this many seconds.
  82. @access private */
  83. var $lockfile = "";
  84. /** Number of seconds to wait on a lockfile. If zero, wait forever.
  85. @access private */
  86. var $lockfile_wait_secs = 0;
  87. /** Indexing execution timer
  88. @access private */
  89. var $timer;
  90. // .....................................................................
  91. /**
  92. * Constructor
  93. * Create a new file indexer
  94. * @param string $application Application name
  95. * @param string $host Hostname or IP of search engine server
  96. * @param string $port Port of search engine server
  97. */
  98. function fileindexer($application="?", $host="", $port="") {
  99. // Store for reference..
  100. $this->application = $application;
  101. $this->host = $host;
  102. $this->port = $port;
  103. $this->timer = new microtimer();
  104. } // fileindexer
  105. // .....................................................................
  106. /**
  107. * Define a field. We supply the name of the field, it's type (Text, Date
  108. * or Id), and whether it should be stored by the search engine for later
  109. * retreival in queries. For example you would not store the raw
  110. * document/content as this is usually stored elsewhere.
  111. * IMPORTANT NOTE: Fields defined here will automatically be included as
  112. * meta fields.
  113. * @see meta_fields()
  114. * @param string $fieldname Name of the field to index
  115. * @param string $type Type of field data: Text, Date or Id.
  116. * @param boolean $stored If true then search engine will store the content itself
  117. * @param boolean $indexed If true then search engine will index the field content
  118. */
  119. function define_field($fieldname, $type, $stored=STORED, $indexed=INDEXED) {
  120. $this->field_definitions[$fieldname]
  121. = $type . "|" . (($stored) ? "true" : "false") . "|" . (($indexed) ? "true" : "false");
  122. // Register for meta tags..
  123. $this->meta_field($fieldname, $type);
  124. } // define_field
  125. // .....................................................................
  126. /**
  127. * Define a lockfile which we must avoid during indexing. If defined
  128. * then no indexing will take place while the lockfile exists. The
  129. * second parameter allows you to specify a limit to the patience of
  130. * this process, in seconds. Zero means wait forever.
  131. * @param string $lockfile Path to the lockfile. Nullstring = not defined
  132. * @param integer $wait_secs Time to wait for lockfile. Zero means forever.
  133. */
  134. function avoid_lockfile($lockfile, $wait_secs=0) {
  135. $this->lockfile = $lockfile;
  136. $this->lockfile_wait_secs = $wait_secs;
  137. } // avoid_lockfile
  138. // .....................................................................
  139. /**
  140. * Define a field as a meta tag. This ensures that the field will be
  141. * picked up from the file meta tags, if present. If it is not listed
  142. * here then it will be ignored.
  143. * IMPORTANT NOTE: We define the strict rule that ONLY fields which have
  144. * been defined here can be added to the indexing via the meta tag scanning.
  145. * Ie. you must define fields here explicitly, or via the define_field()
  146. * method, or they will be ignored even if they turn up as a meta tag.
  147. * This is so we can restrict the indexing, and be sure of field types.
  148. * @see define_field()
  149. * @param string $fieldname Name of the field to process as meta tag
  150. * @param string $type Type of field data: Text, Date or Id.
  151. */
  152. function meta_field($fieldname, $type) {
  153. $this->meta_fields[$fieldname] = $type;
  154. } // meta_field
  155. // .....................................................................
  156. /**
  157. * Supply field content for indexing. This causes the search engine to take
  158. * the given fieldname and index the given value against it.
  159. * The field name can have the field type included in the form 'Foo:Date',
  160. * where 'Date' is the type in this instance. In fact, since 'Text' is the
  161. * default filed type, 'Date' is probably the only one you need to use
  162. * as the current implementation stands.
  163. * @param string $fieldname Name of the field to index.
  164. * @param string $fieldvalue Content of the field to index
  165. */
  166. function index_field($fieldname, $fieldvalue) {
  167. $this->indexfields[$fieldname] = $fieldvalue;
  168. } // index_field
  169. // .....................................................................
  170. /**
  171. * Set the source for ID generation. Since we are indexing a bunch of
  172. * files, the ID's have to be generated on demand inside the loop. So
  173. * we provide for various ways here, and you can extend this class to
  174. * provide more if required.
  175. * Main ways:
  176. * ID_FROM_INC Increment a counter by 1 each time (with offset)
  177. * ID_FROM_NAME Take the filename, strip the extension, add prefix
  178. * ID_FROM_FILENAME Take the full filename, add prefix
  179. * ID_FROM_PATH Take the full file path
  180. * NB: These are all defined as integer constants.
  181. * @param integer $idsource Source of ID generation
  182. * @param mixed $pfxofs String prefix, or integer offset
  183. */
  184. function id_generate($idsource=ID_FROM_INC, $pfxofs="") {
  185. $this->idsource = $idsource;
  186. if ($pfxofs != "") {
  187. if (is_string($pfxofs)) {
  188. $this->idprefix = $pfxofs;
  189. }
  190. else {
  191. $this->idoffset = (int)$pfxofs;
  192. }
  193. }
  194. } // id_generate
  195. // .....................................................................
  196. /**
  197. * Flag that we should do a tag scan on the content of the files to try
  198. * and extract fields to index. Note that any tags thus found will only
  199. * be used if the field name has been defined with the method define_field();
  200. * This causes both the <title> tag and <meta> tags to be considered.
  201. * @see fileindexer::define_field()
  202. */
  203. function scantags() {
  204. $this->metascan = true;
  205. } // scantags
  206. // .....................................................................
  207. /**
  208. * Flag that we should NOT do a tag scan on the content of the files.
  209. */
  210. function noscantags() {
  211. $this->metascan = false;
  212. } // noscantags
  213. // .....................................................................
  214. /**
  215. * Index a file located at the given path, using given ID.
  216. * You can also use the parameter $fields to supply an array of
  217. * fieldname/value pairs to index with this file, for one-off indexing of
  218. * files. If the fieldname is a date field, make sure to define the
  219. * name as 'Foo:Date', to cause the field definition to be correct.
  220. * @param string $path Path to the head of the file tree to index
  221. * @param string $id ID to associate with the indexed file content
  222. * @param mixed $fields Array of field/values to index with file
  223. */
  224. function index_file($path, $id, $fields=false) {
  225. $success = false;
  226. $f = new inputfile($path);
  227. if ($f->opened) {
  228. $f->readall();
  229. $f->closefile();
  230.  
  231. // Wait for a lockfile, if we really have to..
  232. if ($this->lockfile != "" && file_exists($this->lockfile)) {
  233. $waitforit = true;
  234. debugbr("waiting for lockfile..", DBG_DEBUG);
  235. if ($this->lockfile_wait_secs > 0) {
  236. $locktimer = new microtimer();
  237. $locktimer->start();
  238. }
  239. do {
  240. clearstatcache();
  241. if (!file_exists($this->lockfile)) {
  242. $waitforit = false;
  243. debugbr("lockfile has been removed..", DBG_DEBUG);
  244. }
  245. elseif ($this->lockfile_wait_secs > 0 && $locktimer->secs() >= $this->lockfile_wait_secs) {
  246. $waitforit = false;
  247. debugbr("lockfile wait (" . $this->lockfile_wait_secs ."secs) timed out..", DBG_DEBUG);
  248. }
  249. else {
  250. sleep(1);
  251. }
  252. } while ($waitforit === true);
  253. }
  254.  
  255. // Create the index message..
  256. $ix = new lucene_indexmsg($this->application, $this->host, $this->port);
  257.  
  258. // Define the fields for the index message..
  259. foreach ($this->field_definitions as $fieldname => $attributes) {
  260. $bits = explode("|", $attributes);
  261. $type = $bits[0];
  262. $stored = (strcasecmp($bits[1], "true") == 0);
  263. $indexed = (strcasecmp($bits[2], "true") == 0);
  264. $ix->define_field($fieldname, $type, $stored, $indexed);
  265. }
  266.  
  267. // Scan file content for meta tags for index fields..
  268. $content = preg_replace("/[\xe2][\x80]./", "", $f->content);
  269. $content = preg_replace("/[\xc2][\xb7]./", "", $content);
  270. $content = preg_replace("/[\xc2]&/", " ", $content);
  271. $content = preg_replace("/[\xc3]&/", " ", $content);
  272.  
  273. if ($this->metascan) {
  274. $tagpat = "/<meta name=\"(.*?)\" content=\"(.*?)\">/i";
  275. $matches = array();
  276. if (preg_match_all($tagpat, $content, $matches)) {
  277. for ($i=0; $i < count($matches[0]); $i++) {
  278. $fieldname = $matches[1][$i];
  279. $fieldvalue = $matches[2][$i];
  280. if (isset($this->meta_fields[$fieldname])) {
  281. // Get type..
  282. $type = $this->meta_fields[$fieldname];
  283. if (!strcasecmp($type, "date")) {
  284. // Newsquest date field format requires stripping off a prefix
  285. // 'DT' - a temporary hack which should be completely transparent
  286. // to everyone else using this. NB: originally NewsQuest only
  287. // stored date in 'DTdd/mm/yyyy' format. This parsing is also
  288. // compatible with the new 'DTdd/mm/yyyy hh:mm[:ss]' format.
  289. if (substr($fieldvalue, 0, 2) == "DT") {
  290. $fieldvalue = substr($fieldvalue, 2);
  291. }
  292. // Need to convert to Unix timestamp..
  293. $ts = displaydate_to_timestamp($fieldvalue);
  294. $fieldvalue = $ts;
  295. }
  296. debugbr("meta tag index field: $fieldname=$fieldvalue");
  297. $ix->index_field($fieldname, $fieldvalue);
  298. }
  299. else {
  300. debugbr("rejected unlisted tag field: $fieldname");
  301. }
  302. }
  303. }
  304. // Check for title tag in HTML page if required field..
  305. if (preg_match("/<(title)>(.*?)<\/title>/i", $content, $matches)) {
  306. $fieldname = $matches[1];
  307. $fieldvalue = $matches[2];
  308. if (isset($this->meta_fields[$fieldname])) {
  309. $type = $this->meta_fields[$fieldname];
  310. debugbr("title tag index field: $fieldname=$fieldvalue");
  311. $ix->index_field($fieldname, $fieldvalue);
  312. }
  313. }
  314. } // metascan
  315.  
  316. // Deal with passed-in field settings. These are meant to cater
  317. // for indexing of individual files using this method. We just
  318. // add them to any existing field/values already set up..
  319. if ($fields) {
  320. reset($fields);
  321. while (list($fieldname, $fieldvalue) = each($fields)) {
  322. $this->index_field($fieldname, $fieldvalue);
  323. }
  324. }
  325.  
  326. // Process field/value pairs which have been added either by the
  327. // index_field() method, or passed in via the $fields parameter..
  328. if (count($this->indexfields) > 0) {
  329. reset($this->indexfields);
  330. while (list($fieldname, $fieldvalue) = each($this->indexfields)) {
  331. $bits = explode(":", $fieldname);
  332. $type = ((isset($bits[1])) ? $bits[1] : "Text");
  333. $fieldname = $bits[0];
  334. debugbr("index field: $fieldname=$fieldvalue");
  335. $ix->define_field($fieldname, $type);
  336. $ix->index_field($fieldname, $fieldvalue);
  337. }
  338. }
  339.  
  340. // Index the file content. We get rid of any HTML tags..
  341. debugbr("indexing file: $path, ID=$id");
  342. $ix->index_content($id, strip_tags($content));
  343.  
  344. // Send the index message to the search engine. We specify a large
  345. // timeout since we really want this to succeed and search engine
  346. // may be in an optimization fugue..
  347. $success = $ix->send(120);
  348. if(!$success) {
  349. debugbr("failed: $ix->error_msg");
  350. }
  351. }
  352. else {
  353. debugbr("open failed on '$path'");
  354. }
  355. return $success;
  356. } // index_file
  357. // .....................................................................
  358. /**
  359. * Index a tree of files starting at the path given. We index these in one
  360. * of four modes, which determines how we generate the ID for each item:
  361. * 'ID_FROM_INC' mode uses an incremental counter starting at 1. If $prefix
  362. * holds a number, the counter will start at this number instead of one.
  363. * Each item has an ID incremented by one from the last one.
  364. * 'ID_FROM_NAME' mode uses the filename, stripped of any path and extension
  365. * as the ID. If prefix is not a nullstring, then it is prefixed to every
  366. * filename ID.
  367. * 'ID_FROM_FILENAME' mode uses the filename, including any extension
  368. * as the ID. If prefix is not a nullstring, then it is prefixed to every
  369. * filename ID.
  370. * 'ID_FROM_PATH' mode uses the full path to the item being indexed as the
  371. * ID. If prefix is not a nullstring, then it is prefixed to every
  372. * filename ID.
  373. * The file will simply be indexed as a single Text field, with the
  374. * appropriate ID, and no other index fields unless $metascan is set to TRUE.
  375. * If this is the case, the system will scan the file for HTML meta tags of
  376. * form: '<meta name="foo" content="bar">'. In this example a field of name
  377. *'foo' would be given value 'bar'.
  378. * @param string $path Path to the head of the file tree to index
  379. * @param $patt Pattern to match, eg. '*.html'
  380. * @param $restart If equal to "restart" then treat $path as file of paths
  381. * @param $lockfile If path is set, we idle whilst this file exists
  382. * @param string $lockfile Path to the lockfile. Nullstring = not defined
  383. * @param integer $wait_secs Time to wait for lockfile. Zero means forever.
  384. */
  385. function index_tree($path, $patt="", $restart="", $lockfile="", $wait_secs=0) {
  386. // Set up any lockfile definition..
  387. $this->avoid_lockfile($lockfile, $wait_secs);
  388.  
  389. if ($restart == "restart") {
  390. // Restart from existing paths file..
  391. $tmpfname = $path;
  392. debugbr("restarting with existing item list $path", DBG_DEBUG);
  393. }
  394. else {
  395. // Use find to generate item list to a temporary file..
  396. debugbr("generating item list", DBG_DEBUG);
  397. $tmpfname = tempnam("/tmp", "LU");
  398. $cmd = "find $path";
  399. if ($patt != "") $cmd .= " -name \"$patt\"";
  400. $cmd .= " >$tmpfname";
  401. exec($cmd);
  402. }
  403. $treelist = new inputfile($tmpfname);
  404. if ($treelist->opened) {
  405. // Find the number of items..
  406. debugbr("counting items", DBG_DEBUG);
  407. $todo = (int) exec("cat $tmpfname|wc -l");
  408. if ($todo > 0) {
  409. $done = 0; $succeeded = 0; $failed = 0; $last = 0;
  410. debugbr("$todo items to index", DBG_DEBUG);
  411. $this->timer->start();
  412. $idix = 0;
  413. if ($this->idsource == ID_FROM_INC) {
  414. $idix += $this->idoffset;
  415. }
  416.  
  417. while ($path = $treelist->readln()) {
  418. // Generate an ID to use..
  419. switch ($this->idsource) {
  420. case ID_FROM_INC:
  421. // Use incremented index..
  422. $id = $idix + 1;
  423. $idix += 1;
  424. break;
  425.  
  426. case ID_FROM_NAME:
  427. // Use filename, minus extenaion..
  428. $fname = basename($path);
  429. if (strstr($fname, ".")) {
  430. $bits = explode(".", $fname);
  431. $dummy = array_pop($bits);
  432. $fname = implode(".", $bits);
  433. }
  434. $id = $this->idprefix . $fname;
  435. break;
  436.  
  437. case ID_FROM_FILENAME:
  438. // Use full filename..
  439. $id = $this->idprefix . basename($path);
  440. break;
  441.  
  442. case ID_FROM_PATH:
  443. // Use full file path..
  444. $id = $this->idprefix . $path;
  445. break;
  446. } // switch
  447.  
  448. // Index the file with new ID..
  449. if ($this->index_file($path, $id)) {
  450. debugbr("$id indexed", DBG_DEBUG);
  451. $succeeded += 1;
  452. }
  453. else {
  454. debugbr("$path index failed", DBG_DEBUG);
  455. //break;
  456. $failed += 1;
  457. }
  458.  
  459. // Progress check..
  460. $done += 1;
  461.  
  462. // If the verbose output option is enabled, we compile
  463. // stats and display these via the debugger..
  464. if (debugging()) {
  465. $pct = ($done / $todo) * 100;
  466. $pct_int = (int)(floor($pct));
  467. $pct_mod = $pct % 5;
  468. if ($pct_mod == 0 && $pct_int > $last) {
  469. $secperdoc = $this->timer->secs() / $done;
  470. $timedone = $this->timer->formatted_time();
  471. $timeleft = nicetime(($todo - $done) * $secperdoc);
  472. $ms = $this->timer->millisecs();
  473. $msper = number_format( ($ms / $done), 0);
  474. debugbr("Mark: $pct_int% $timedone ($done) Rate:$msper" . "ms/item Left:$timeleft", DBG_DEBUG);
  475. $last = $pct_int;
  476. }
  477. }
  478. } // while
  479.  
  480. // Close tree list file..
  481. $treelist->closefile();
  482.  
  483. // Wrap it up..
  484. $this->timer->stop();
  485.  
  486. // Final stats if verbose mode..
  487. if (debugging()) {
  488. $secs = $this->timer->secs();
  489. $msper = number_format( (1000 * $secs / $todo), 2);
  490. $sper1000 = number_format( ($secs / $todo) * 1000, 2);
  491. debugbr("time taken per item: " . $msper . "msec", DBG_DEBUG);
  492. debugbr("time per 1000 items: " . nicetime($sper1000), DBG_DEBUG);
  493. debugbr("total time taken: " . $this->timer->formatted_time(), DBG_DEBUG);
  494. debugbr("successfully indexed: $succeeded", DBG_DEBUG);
  495. debugbr("indexing failures: $failed", DBG_DEBUG);
  496. }
  497. }
  498. else {
  499. debugbr("nothing to index", DBG_DEBUG);
  500. }
  501. }
  502. else {
  503. debugbr("failed to open $tmpfname", DBG_DEBUG);
  504. }
  505. } // index_tree
  506.  
  507. } // fileindexer class
  508. // ----------------------------------------------------------------------
  509.  
  510. ?>

Documentation generated by phpDocumentor 1.3.0RC3