Source for file search-fileindex-defs.php

Documentation is available at search-fileindex-defs.php

  1. <?php
  2. /* ******************************************************************** */
  3. /* CATALYST PHP Source Code */
  4. /* -------------------------------------------------------------------- */
  5. /* This program is free software; you can redistribute it and/or modify */
  6. /* it under the terms of the GNU General Public License as published by */
  7. /* the Free Software Foundation; either version 2 of the License, or */
  8. /* (at your option) any later version. */
  9. /* */
  10. /* This program is distributed in the hope that it will be useful, */
  11. /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
  12. /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
  13. /* GNU General Public License for more details. */
  14. /* */
  15. /* You should have received a copy of the GNU General Public License */
  16. /* along with this program; if not, write to: */
  17. /* The Free Software Foundation, Inc., 59 Temple Place, Suite 330, */
  18. /* Boston, MA 02111-1307 USA */
  19. /* -------------------------------------------------------------------- */
  20. /* */
  21. /* Filename: search-fileindex-defs.php */
  22. /* Author: Paul Waite */
  23. /* Description: Search Engine Module */
  24. /* Specialised indexing class for indexing file content. */
  25. /* */
  26. /* ******************************************************************** */
  27. /** @package search */("search-index-defs.php");
  28.  
  29. /**
  30. * The file indexer class.
  31. * This class indexes files on disc, either one by one or as a whole
  32. * file hierarchy tree.
  33. * @package search
  34. */
  35. class fileindexer {
  36. // Public
  37. /** Application we are indexing for */
  38.  
  39. var $application = "";
  40. /** Host to connect to */
  41.  
  42. var $host = "";
  43. /** Port to connect to */
  44.  
  45. var $port = "";
  46.  
  47. // Private
  48. /** The index ID
  49. @access private */
  50. var $ixid;
  51. /** ID generation source
  52. @access private */
  53. var $idsource = ID_FROM_INC;
  54. /** Scan for meta tags as fields in file content. Recommended.
  55. @access private */
  56. var $metascan = true;
  57. /** Meta fields definitions array. Contains definitions
  58. for the fields we will process if found as meta tags.
  59. @access private */
  60. var $meta_fields = array();
  61. /** Index fields definitions array. Contains definitions
  62. for the fields we are expecting to index.
  63. @access private */
  64. var $field_definitions = array();
  65. /** Fields for indexing. This is an array of fieldname/value
  66. pairs which should be added during the indexing. These
  67. fields do not have to appear in $field_definitions.
  68. @access private */
  69. var $indexfields = array();
  70. /** ID generation offset
  71. @access private */
  72. var $idoffset = 0;
  73. /** ID generation prefix
  74. @access private */
  75. var $idprefix = "";
  76. /** Timeout for indexing commands in seconds (can usually leave
  77. as nullstring)
  78. @access private */
  79. var $timeoutsecs = "";
  80. /** Number of milli-seconds to wait nicely between indexing calls.
  81. @access private */
  82. var $nice_msecs = 0;
  83. /** Indexing execution timer
  84. @access private */
  85. var $timer;
  86. // .....................................................................
  87. /**
  88. * Constructor
  89. * Create a new file indexer
  90. * @param string $application Application name
  91. * @param string $host Hostname or IP of search engine server
  92. * @param string $port Port of search engine server
  93. */
  94. function fileindexer($application="?", $host="", $port="") {
  95. // Store for reference..
  96. $this->application = $application;
  97. $this->host = $host;
  98. $this->port = $port;
  99. $this->timer = new microtimer();
  100. } // fileindexer
  101. // .....................................................................
  102. /**
  103. * Define a field. We supply the name of the field, it's type (Text, Date
  104. * or Id), and whether it should be stored by the search engine for later
  105. * retreival in queries. For example you would not store the raw
  106. * document/content as this is usually stored elsewhere.
  107. * IMPORTANT NOTE: Fields defined here will automatically be included as
  108. * meta fields.
  109. * @see meta_fields()
  110. * @param string $fieldname Name of the field to index
  111. * @param string $type Type of field data: Text, Date or Id.
  112. * @param boolean $stored If true then search engine will store the content itself
  113. * @param boolean $indexed If true then search engine will index the field content
  114. */
  115. function define_field($fieldname, $type, $stored=STORED, $indexed=INDEXED) {
  116. $this->field_definitions[$fieldname]
  117. = $type . "|" . (($stored) ? "true" : "false") . "|" . (($indexed) ? "true" : "false");
  118. // Register for meta tags..
  119. $this->meta_field($fieldname, $type);
  120. } // define_field
  121. // .....................................................................
  122. /**
  123. * Define a field as a meta tag. This ensures that the field will be
  124. * picked up from the file meta tags, if present. If it is not listed
  125. * here then it will be ignored.
  126. * IMPORTANT NOTE: We define the strict rule that ONLY fields which have
  127. * been defined here can be added to the indexing via the meta tag scanning.
  128. * Ie. you must define fields here explicitly, or via the define_field()
  129. * method, or they will be ignored even if they turn up as a meta tag.
  130. * This is so we can restrict the indexing, and be sure of field types.
  131. * @see define_field()
  132. * @param string $fieldname Name of the field to process as meta tag
  133. * @param string $type Type of field data: Text, Date or Id.
  134. * @param boolean $stored If true then search engine will store the content itself
  135. * @param boolean $indexed If true then search engine will index the field content
  136. */
  137. function meta_field($fieldname, $type, $stored=STORED, $indexed=INDEXED) {
  138. $this->meta_fields[$fieldname] = $type;
  139. if (strtolower($fieldname) != "id" && !isset($this->field_definitions[$fieldname])) {
  140. $this->define_field($fieldname, $type, $stored, $indexed);
  141. }
  142. } // meta_field
  143. // .....................................................................
  144. /**
  145. * Supply field content for indexing. This causes the search engine to take
  146. * the given fieldname and index the given value against it.
  147. * The field name can have the field type included in the form 'Foo:Date',
  148. * where 'Date' is the type in this instance. In fact, since 'Text' is the
  149. * default filed type, 'Date' is probably the only one you need to use
  150. * as the current implementation stands.
  151. * @param string $fieldname Name of the field to index.
  152. * @param string $fieldvalue Content of the field to index
  153. */
  154. function index_field($fieldname, $fieldvalue) {
  155. $this->indexfields[$fieldname] = $fieldvalue;
  156. } // index_field
  157. // .....................................................................
  158. /**
  159. * Set the source for ID generation. Since we are indexing a bunch of
  160. * files, the ID's have to be generated on demand inside the loop. So
  161. * we provide for various ways here, and you can extend this class to
  162. * provide more if required.
  163. * Main ways:
  164. * ID_FROM_INC Increment a counter by 1 each time (with offset)
  165. * ID_FROM_NAME Take the filename, strip the extension, add prefix
  166. * ID_FROM_FILENAME Take the full filename, add prefix
  167. * ID_FROM_PATH Take the full file path
  168. * NB: These are all defined as integer constants.
  169. * @param integer $idsource Source of ID generation
  170. * @param mixed $pfxofs String prefix, or integer offset
  171. */
  172. function id_generate($idsource=ID_FROM_INC, $pfxofs="") {
  173. $this->idsource = $idsource;
  174. if ($pfxofs != "") {
  175. if (is_string($pfxofs)) {
  176. $this->idprefix = $pfxofs;
  177. }
  178. else {
  179. $this->idoffset = (int)$pfxofs;
  180. }
  181. }
  182. } // id_generate
  183. // .....................................................................
  184. /**
  185. * Flag that we should do a tag scan on the content of the files to try
  186. * and extract fields to index. Note that any tags thus found will only
  187. * be used if the field name has been defined with the method define_field();
  188. * This causes both the <title> tag and <meta> tags to be considered.
  189. * @see fileindexer::define_field()
  190. */
  191. function scantags() {
  192. $this->metascan = true;
  193. } // scantags
  194. // .....................................................................
  195. /**
  196. * Flag that we should NOT do a tag scan on the content of the files.
  197. */
  198. function noscantags() {
  199. $this->metascan = false;
  200. } // noscantags
  201. // .....................................................................
  202. /**
  203. * Index a file located at the given path, using given ID.
  204. * You can also use the parameter $fields to supply an array of
  205. * fieldname/value pairs to index with this file, for one-off indexing of
  206. * files. If the fieldname is a date field, make sure to define the
  207. * name as 'Foo:Date', to cause the field definition to be correct.
  208. * @param string $path Path to the head of the file tree to index
  209. * @param string $id ID to associate with the indexed file content
  210. * @param mixed $fields Array of field/values to index with file
  211. */
  212. function index_file($path, $id, $fields=false) {
  213. $success = false;
  214. $f = new quickfile_read($path);
  215. if ($f->readok) {
  216.  
  217. // Create the index message..
  218. $ix = new searchengine_indexer($this->application, $this->host, $this->port);
  219.  
  220. // Define the fields for the index message..
  221. if (count($this->field_definitions) > 0) {
  222. foreach ($this->field_definitions as $fieldname => $attributes) {
  223. $bits = explode("|", $attributes);
  224. $type = $bits[0];
  225. $stored = (strcasecmp($bits[1], "true") == 0);
  226. $indexed = (strcasecmp($bits[2], "true") == 0);
  227. $ix->define_field($fieldname, $type, $stored, $indexed);
  228. }
  229. }
  230. else {
  231. debugbr("notice no defined fields found - indexing will use default properties file defs", DBG_DUMP);
  232. }
  233.  
  234. // Scan file content for meta tags for index fields..
  235. $content = preg_replace("/[\xe2][\x80]./", "", $f->content);
  236. $content = preg_replace("/[\xc2][\xb7]./", "", $content);
  237. $content = preg_replace("/[\xc2]&/", " ", $content);
  238. $content = preg_replace("/[\xc3]&/", " ", $content);
  239.  
  240. if ($this->metascan) {
  241. $tagpat = "/<meta name=\"(.*?)\" content=\"(.*?)\">/i";
  242. $matches = array();
  243. if (preg_match_all($tagpat, $content, $matches)) {
  244. for ($i=0; $i < count($matches[0]); $i++) {
  245. $fieldname = $matches[1][$i];
  246. $fieldvalue = $matches[2][$i];
  247. if (isset($this->meta_fields[$fieldname])) {
  248. // Get type..
  249. $type = $this->meta_fields[$fieldname];
  250. if (!strcasecmp($type, "date")) {
  251. // Newsquest date field format requires stripping off a prefix
  252. // 'DT' - a temporary hack which should be completely transparent
  253. // to everyone else using this. NB: originally NewsQuest only
  254. // stored date in 'DTdd/mm/yyyy' format. This parsing is also
  255. // compatible with the new 'DTdd/mm/yyyy hh:mm[:ss]' format.
  256. if (substr($fieldvalue, 0, 2) == "DT") {
  257. $fieldvalue = substr($fieldvalue, 2);
  258. }
  259. // Need to convert to Unix timestamp..
  260. $ts = displaydate_to_timestamp($fieldvalue);
  261. $fieldvalue = $ts;
  262. }
  263. debugbr("meta tag index field: $fieldname=$fieldvalue", DBG_DUMP);
  264. $ix->index_field($fieldname, $fieldvalue);
  265. }
  266. else {
  267. debugbr("rejected unlisted tag field: $fieldname", DBG_DUMP);
  268. }
  269. }
  270. }
  271. // Check for title tag in HTML page if required field..
  272. if (preg_match("/<(title)>(.*?)<\/title>/i", $content, $matches)) {
  273. $fieldname = $matches[1];
  274. $fieldvalue = $matches[2];
  275. if (isset($this->meta_fields[$fieldname])) {
  276. $type = $this->meta_fields[$fieldname];
  277. debugbr("title tag index field: $fieldname=$fieldvalue", DBG_DUMP);
  278. $ix->index_field($fieldname, $fieldvalue);
  279. }
  280. }
  281. } // metascan
  282.  
  283. // Deal with passed-in field settings. These are meant to cater
  284. // for indexing of individual files using this method. We just
  285. // add them to any existing field/values already set up..
  286. if ($fields) {
  287. reset($fields);
  288. while (list($fieldname, $fieldvalue) = each($fields)) {
  289. $this->index_field($fieldname, $fieldvalue);
  290. }
  291. }
  292.  
  293. // Process field/value pairs which have been added either by the
  294. // index_field() method, or passed in via the $fields parameter..
  295. if (count($this->indexfields) > 0) {
  296. reset($this->indexfields);
  297. while (list($fieldname, $fieldvalue) = each($this->indexfields)) {
  298. $bits = explode(":", $fieldname);
  299. $type = ((isset($bits[1])) ? $bits[1] : "Text");
  300. $fieldname = $bits[0];
  301. debugbr("index field: $fieldname=$fieldvalue", DBG_DUMP);
  302. $ix->define_field($fieldname, $type);
  303. $ix->index_field($fieldname, $fieldvalue);
  304. }
  305. }
  306.  
  307. // Index the file content. We get rid of any HTML tags..
  308. debugbr("indexing file: $path, ID=$id", DBG_DUMP);
  309. $ix->index_content($id, strip_tags($content));
  310.  
  311. // Send the index message to the search engine. We specify a large
  312. // timeout since we really want this to succeed and search engine
  313. // may be in an optimization fugue..
  314. $success = $ix->execute(120);
  315. if(!$success) {
  316. debugbr("failed: $ix->error_msg", DBG_DEBUG);
  317. }
  318. }
  319. else {
  320. debugbr("open failed on '$path'", DBG_DEBUG);
  321. }
  322. return $success;
  323. } // index_file
  324. // .....................................................................
  325. /**
  326. * Index a tree of files starting at the path given. We index these in one
  327. * of four modes, which determines how we generate the ID for each item:
  328. * 'ID_FROM_INC' mode uses an incremental counter starting at 1. If $prefix
  329. * holds a number, the counter will start at this number instead of one.
  330. * Each item has an ID incremented by one from the last one.
  331. * 'ID_FROM_NAME' mode uses the filename, stripped of any path and extension
  332. * as the ID. If prefix is not a nullstring, then it is prefixed to every
  333. * filename ID.
  334. * 'ID_FROM_FILENAME' mode uses the filename, including any extension
  335. * as the ID. If prefix is not a nullstring, then it is prefixed to every
  336. * filename ID.
  337. * 'ID_FROM_PATH' mode uses the full path to the item being indexed as the
  338. * ID. If prefix is not a nullstring, then it is prefixed to every
  339. * filename ID.
  340. * The file will simply be indexed as a single Text field, with the
  341. * appropriate ID, and no other index fields unless $metascan is set to TRUE.
  342. * If this is the case, the system will scan the file for HTML meta tags of
  343. * form: '<meta name="foo" content="bar">'. In this example a field of name
  344. *'foo' would be given value 'bar'.
  345. * @param string $path Path to the head of the file tree to index
  346. * @param $patt Pattern to match, eg. '*.html'
  347. * @param $mode "file": read $path as file of paths, "dir": recurse $path as dir
  348. * @param integer $nice_msecs Time to nicely wait between index calls
  349. * @return array List of 3 counts: $done, $succeeded, $failed
  350. */
  351. function index_tree($path, $patt="", $mode="dir", $nice_msecs=0) {
  352. // Init return vars
  353. $done = 0; $succeeded = 0; $failed = 0; $last = 0;
  354. // Store 'nice' delay..
  355. $this->nice_msecs = intval($nice_msecs);
  356. if ($mode == "file") {
  357. $tmpfname = $path;
  358. debugbr("indexing from item list file at $path", DBG_DEBUG);
  359. }
  360. else {
  361. // Use find to generate item list to a temporary file..
  362. debugbr("generating item list by recursing $path", DBG_DEBUG);
  363. $tmpfname = tempnam("/tmp", "LU");
  364. $cmd = "find $path";
  365. if ($patt != "") $cmd .= " -name \"$patt\"";
  366. $cmd .= " >$tmpfname";
  367. exec($cmd);
  368. }
  369. $treelist = new inputfile($tmpfname);
  370. if ($treelist->opened) {
  371. // Find the number of items..
  372. debugbr("counting items", DBG_DEBUG);
  373. $todo = (int) exec("cat $tmpfname|wc -l");
  374. if ($todo > 0) {
  375. debugbr("$todo items to index", DBG_DEBUG);
  376. $this->timer->start();
  377. $idix = 0;
  378. if ($this->idsource == ID_FROM_INC) {
  379. $idix += $this->idoffset;
  380. }
  381.  
  382. while ($path = $treelist->readln()) {
  383. // Generate an ID to use..
  384. switch ($this->idsource) {
  385. case ID_FROM_INC:
  386. // Use incremented index..
  387. $id = $idix + 1;
  388. $idix += 1;
  389. break;
  390.  
  391. case ID_FROM_NAME:
  392. // Use filename, minus extenaion..
  393. $fname = basename($path);
  394. if (strstr($fname, ".")) {
  395. $bits = explode(".", $fname);
  396. $dummy = array_pop($bits);
  397. $fname = implode(".", $bits);
  398. }
  399. $id = $this->idprefix . $fname;
  400. break;
  401.  
  402. case ID_FROM_FILENAME:
  403. // Use full filename..
  404. $id = $this->idprefix . basename($path);
  405. break;
  406.  
  407. case ID_FROM_PATH:
  408. // Use full file path..
  409. $id = $this->idprefix . $path;
  410. break;
  411. } // switch
  412.  
  413. // Index the file with new ID..
  414. if ($this->index_file($path, $id)) {
  415. debugbr("$id indexed", DBG_DEBUG);
  416. $succeeded += 1;
  417. }
  418. else {
  419. debugbr("$path index failed", DBG_DEBUG);
  420. //break;
  421. $failed += 1;
  422. }
  423.  
  424. // Progress check..
  425. $done += 1;
  426.  
  427. // If the verbose output option is enabled, we compile
  428. // stats and display these via the debugger..
  429. if (debugging()) {
  430. $pct = ($done / $todo) * 100;
  431. $pct_int = (int)(floor($pct));
  432. $pct_mod = $pct % 5;
  433. if ($pct_mod == 0 && $pct_int > $last) {
  434. $secperdoc = $this->timer->secs() / $done;
  435. $timedone = $this->timer->formatted_time();
  436. $timeleft = nicetime(($todo - $done) * $secperdoc);
  437. $ms = $this->timer->millisecs();
  438. $msper = number_format( ($ms / $done), 0);
  439. debugbr("Mark: $pct_int% $timedone ($done) Rate:$msper" . "ms/item Left:$timeleft", DBG_DEBUG);
  440. $last = $pct_int;
  441. }
  442. // Insert 'nice' delay here if specified..
  443. if ($this->nice_msecs > 0) {
  444. usleep($this->nice_msecs * 1000);
  445. }
  446. }
  447. } // while
  448. // Success flag - one or more failures is an error
  449. $success = ($failed == 0);
  450.  
  451. // Close tree list file..
  452. $treelist->closefile();
  453.  
  454. // Wrap it up..
  455. $this->timer->stop();
  456.  
  457. // Final stats if verbose mode..
  458. if (debugging()) {
  459. $secs = $this->timer->secs();
  460. $msper = number_format( (1000 * $secs / $todo), 2);
  461. $sper1000 = number_format( ($secs / $todo) * 1000, 2);
  462. debugbr("time taken per item: " . $msper . "msec", DBG_DEBUG);
  463. debugbr("time per 1000 items: " . nicetime($sper1000), DBG_DEBUG);
  464. debugbr("total time taken: " . $this->timer->formatted_time(), DBG_DEBUG);
  465. if ($this->nice_msecs > 0) {
  466. debugbr("nice delay per item: " . $this->nice_msecs . "msec", DBG_DEBUG);
  467. }
  468. debugbr("successfully indexed: $succeeded", DBG_DEBUG);
  469. debugbr("indexing failures: $failed", DBG_DEBUG);
  470. }
  471. }
  472. else {
  473. debugbr("nothing to index", DBG_DEBUG);
  474. }
  475. }
  476. else {
  477. debugbr("failed to open $tmpfname", DBG_DEBUG);
  478. }
  479. // Return value
  480. return array($done, $succeeded, $failed);
  481. } // index_tree
  482.  
  483. } // fileindexer class
  484. // ----------------------------------------------------------------------
  485.  
  486. ?>

Documentation generated by phpDocumentor 1.3.0RC3