
source idx_items
{
# data source type
# known types are 'mysql', 'pgsql', 'xmlpipe', 'xmlpipe2'
type = mysql
# these are db connection parameters
sql_host = localhost
sql_user = username
sql_pass = password
sql_db = feeddb
sql_port = 3306
# range queries are to reduce the size of the result set
# being returned as you are indexing. This can be particularly
# helpful with MyISAM to avoid long locks
# this particular query sets obtains the $start and $stop values
# that are used in the main data query determined by the next parameter,
# sql_range_step
sql_query_range = SELECT MIN(id),MAX(id) FROM itemtab
# the size of the result set. So,if the complete number of items is 1M,
# then there will be 10 data obtaining queries until all items have
# been indexed
sql_range_step = 100000
# a query to run prior to the main data obtaining query to set the
# counter table position. In this case, the position in the pointer
# table is set to the max id of the items table, for the index named
# host:items_idx. This is a way to have the counter table provide
# accounting not only for the local index, but for indices on
# other servers running Sphinx.
sql_query_pre = UPDATE sph_counter SET max_id= (SELECT MAX(id) FROM items) WHERE index_name = 'host:items_idx'
# main data query. These are the columns that are indices. The first
# column must be the primary key or unique key
sql_query = SELECT id, item_date, title, content, author, concat('feed_id:',feed_id) as feed_id, item_date FROM v_items WHERE id >= $start AND id <= $end
# another attribute that sphinx will use to index by, required for
# grouping.
sql_attr_timestamp = item_date
# other types
# sql_attr_float - float, double
# sql_attr_str2ordinal - string in from source, 32-bit uint number out
# sql_attr_multi - list of 32-bit uints
# sql_attr_bigint - in version 0.9.9
# This is the query for debugging, using the 'search' utility, which it
# uses this query to obtain data from the database
sql_query_info = SELECT id, concat('feed_id:',feed_id) as feed_id, link,
title, content, author FROM v_items WHERE id=$id;
# there are other options that aren't listed here for brevity.
}
# "items_delta" inherits all settings, unless specified/overridden, from
# it's parent, "items".
# So, the format is "child : parent" obviously
source items_delta : items
{
# only two parameters are overridden here. First, sql_query_pre, to
# update the counter table for the delta index entry, and the range
# query, which obtains the range for any id greater than the max item
# id of the main index
sql_query_pre = UPDATE sph_counter SET max_id= (SELECT MAX(id) FROM
items) WHERE index_name = 'items_delta_idx:host'
sql_query_range = SELECT MIN(id),MAX(id) FROM v_items WHERE id > (SELECT max_id FROM sph_counter WHERE index_name = 'items_idx:host')
}
index items_idx
{
source = items # the "items" source defined about
# path of the index file
path = /usr/local/sphinx/var/data/items_idx
# yes, you are not limited to 3, like FULLTEXT!
min_word_len = 1
# we used utf8
charset_type = utf-8
# strip out HTML as the index is built
html_strip = 1
}
# this child index, we only override two parameters
index items_stemmed_idx : index
{
path = /usr/local/sphinx/var/data/items_stemmed_idx
# this creates an index with stemming
morphology = stem_en
}
# now for the delta index
index items_delta_idx : index
{
# we override the data source to get the proper range of data
source = items_delta
path = /usr/local/sphinx/var/data/items_delta_idx
}
This is a distrubuted index, allowing you to specify local and
remote agents
index items_dist : items_idx
{
# required
type = distributed
# All indices specified as "local", will search serially, but in
# parallel with agents.
# "local" means +1 index to search serially within this single master
# process. The next line below, commented out:
# local = items_idx,items_delta_idx
# is almost identical to
agent = localhost:3312:items_idx, items_delta_idx
# except that the option with "agent=localhost..." would fork an
# additional child
# "agent" means +1 remote agent to contact wherever it is
# and gives better performance for searches in parallel,
# in this example, the index list is grouped together because
# items_delta_idx is a small index compared to items_idx, and there
# wouldn't be much benefit to having it's own agent
agent = hostB:3312:items_idx,items_delta_idx
# NOTES from Andrew:
# The master process sends out all queries, the does all the local
# stuff if any, then waits for all the remove replies.
# Hence, 1 fork + serial local search VS 2 forks + just wait"
}
index items_multiparts : items_idx
{
# Different than the delta index setup, if you have multiple large
# indexes, you would want to list them on
# separate lines.
agent = hostC:3312:items_part1
agent = hostC:3312:items_part2
agent = hostC:3312:items_part3
agent = hostD:3312:items_part4
agent = hostD:3312:items_part5
agent = hostE:3312:items_part6
}
CREATE TABLE sph_counter ( counter_id int(11) NOT NULL, max_id bigint(20) unsigned NOT NULL, index_name varchar(32) NOT NULL default '', last_updated timestamp NOT NULL default CURRENT_TIMESTAMP on update CURRENT_TIMESTAMP, PRIMARY KEY (`counter_id`), KEY `index_name` (`index_name`) )
mysql> select max_id, index_name, last_updated from sph_counter; +-------------+--------------------------+---------------------+ | max_id | index_name | last_updated | +-------------+--------------------------+---------------------+ | 135414507 | items_idx:hostA | 2008-09-25 03:00:01 | | 135565199 | items_delta_idx:hostB | 2008-09-25 13:05:01 | | 135301579 | items_idx:hostB | 2008-09-24 23:07:35 | | 135563903 | items_delta_idx:hostB | 2008-09-25 13:02:25 | +-------------+--------------------------+---------------------+
License:GPL: del.icio.us tag/gpl
python
search
gpl
searchengine
fulltext
opensource
License:GPL