r26333 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: r26333
Date: 23:10, 2 October 2007
Author: rainman
Status: old
Tags:
Comment:
Experimental:
* extract context for each link and index it
* context retrieval uses:
  - StringMap for efficient hashmap serialization
  - ObjectCache, a simple FIFO cache for caching context fields (see the
    sketch below)
* experiment with various scoring schemes: use related as a boost for
  sloppy phrase matches, rank as a boost for exact phrases - probably
  will be changed further
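
For illustration, here is a minimal sketch of the kind of bounded FIFO cache the ObjectCache bullet describes, built on java.util.LinkedHashMap. The class name and API below are hypothetical and are not claimed to match the actual org.wikimedia.lsearch.ranks.ObjectCache added in this revision:

    // Illustrative sketch only: names and API here are hypothetical,
    // not the ObjectCache class from this commit.
    import java.util.LinkedHashMap;
    import java.util.Map;

    public class SimpleFifoCache<K, V> {
        private final LinkedHashMap<K, V> map;

        public SimpleFifoCache(final int capacity) {
            // accessOrder = false keeps insertion order, so eviction is
            // strictly first-in, first-out
            this.map = new LinkedHashMap<K, V>(16, 0.75f, false) {
                protected boolean removeEldestEntry(Map.Entry<K, V> eldest) {
                    return size() > capacity; // drop the oldest entry when full
                }
            };
        }

        public V get(K key) { return map.get(key); }

        public void put(K key, V value) { map.put(key, value); }
    }

The removeEldestEntry hook keeps the sketch to a few lines; a hand-rolled ring buffer would do the same job with less per-entry overhead if the real class needs to be leaner.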

Devel:
* drop the link_analysis index; maintain an index of the pagelinks and
  redirect tables in the links index
* fix bug 11103: use FSUtils for all filesystem-related operations (see the
  sketch below)
* localization: read meta namespace names per dbname, pass iid to
  wikitokenizer, etc.
* RelatedBuilder can build the related mapping from both the links index
  and from a dump; cleanup and field compression
* new search method: related (returns the related mapping for an article)
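
As a rough illustration of the kind of helper the FSUtils bullet refers to, the sketch below centralizes two common filesystem operations using only java.io. The class and method names are hypothetical and are not claimed to match the actual org.wikimedia.lsearch.util.FSUtils API:

    // Illustrative sketch only; not the actual FSUtils API from this revision.
    import java.io.*;

    public final class FsSketch {
        private FsSketch() {}

        /** Recursively delete a file or an entire directory tree. */
        public static void deleteRecursive(File path) throws IOException {
            File[] children = path.listFiles(); // null for plain files
            if (children != null) {
                for (File child : children) {
                    deleteRecursive(child);
                }
            }
            if (path.exists() && !path.delete()) {
                throw new IOException("Cannot delete " + path);
            }
        }

        /** Copy a single file through a small fixed-size buffer. */
        public static void copyFile(File src, File dst) throws IOException {
            InputStream in = new FileInputStream(src);
            OutputStream out = new FileOutputStream(dst);
            try {
                byte[] buf = new byte[8192];
                int n;
                while ((n = >= 0) {
                    out.write(buf, 0, n);
                }
            } finally {
                in.close();
                out.close();
            }
        }
    }

Routing every copy and delete through one utility like this is what lets a bug such as 11103 be fixed in a single place rather than at every call site.
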
Modified paths:
  • /branches/lucene-search-2.1/lsearch-global.conf (modified)
  • /branches/lucene-search-2.1/src/org/apache/lucene (added)
  • /branches/lucene-search-2.1/src/org/apache/lucene/search (added)
  • /branches/lucene-search-2.1/src/org/apache/lucene/search/CustomBoostQuery.java (added)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Analyzers.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/ContextAnalyzer.java (added)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FieldBuilder.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FieldNameFactory.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FilterFactory.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/QueryLanguageAnalyzer.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/RelatedAnalyzer.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/SplitAnalyzer.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiTokenizer.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/ResultSet.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/Title.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/GlobalConfiguration.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/IndexId.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/HttpHandler.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/SearchDaemon.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/CleanupParser.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/HighlightDaemon.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/DumpImporter.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/Importer.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/IndexThread.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiSimilarity.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerClient.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/prefix/PrefixIndexBuilder.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/ContextParser.java (added)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/LinkReader.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Links.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/ObjectCache.java (added)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/RankBuilder.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/StringMap.java (added)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/TitleReader.java (deleted)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/related/CompactRelated.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/related/LinkReader.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/related/RelatedBuilder.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/related/RelatedTitle.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/related/TitleReader.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/RankDocValues.java (added)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/RankValueSource.java (added)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/RankValueSourceQuery.java (added)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/UpdateThread.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Warmup.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexWriter.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestTest.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/SpellCheckIndexer.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/LinkAnalysisStorage.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/RelatedStorage.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/EnglishAnalyzer.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SpellCheckTest.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/Command.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/FSUtils.java (added)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/Localization.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/PHPParser.java (modified)
  • /branches/lucene-search-2.1/test-data/indexing-articles.test (modified)

Diff

Index: branches/lucene-search-2.1/lsearch-global.conf
@@ -18,19 +18,20 @@
 wikilucene : (nssplit,3) (nspart1,[0]) (nspart2,[4,5,12,13]), (nspart3,[])
 wikilucene : (language,en) (warmup,10)
 wikilucene : (spell,3,1) (prefix)
+enwiki,viwiki,srwiki,eswiki,dewiki,mlwiki,zhwiki,jawiki,itwiki,thwiki : (single)
 
 # Search groups
 # Index parts of a split index are always taken from the node's group
 # host : db1.part db2.part
 # Multiple hosts can search multiple dbs (N-N mapping)
 [Search-Group]
-oblak : wikilucene wikidev wikilucene.prefix
+oblak : wikilucene wikidev wikilucene.prefix wikilucene.related wikilucene.links
 
 # Index nodes
 # host: db1.part db2.part
 # Each db.part can be indexed by only one host
 [Index]
-oblak: wikilucene wikidev
+oblak: enwiki wikilucene wikidev viwiki srwiki eswiki dewiki mlwiki zhwiki jawiki itwiki thwiki
 
 # Rsync path where indexes are on hosts, after default value put
 # hosts where the location differs
Index: branches/lucene-search-2.1/test-data/indexing-articles.test
@@ -391,3 +391,419 @@
392392 [[tr:Maxwell denklemleri]]
393393 [[zh:麦克斯韦方程组]]
394394
 395+### namespace = 0
 396+### title = Douglas Adams
 397+### content
 398+
 399+==Education and early works==
 400+[[Image:Douglas Adams Sign from HH cover.jpg|200px|right|thumb|Douglas Adams was known to some fans as ''Bop Ad'' - after his illegible signature]]Adams first attended Primrose Hill Primary School in Brentwood. He took the exams and interviewed for [[Brentwood School (England)|Brentwood School]] at age six, and attended the [[Preparatory school (UK)|preparatory school]] from 1959 to 1964, then the main school until 1970. He was in the top stream, and specialised in the arts in the sixth form, after which he stayed an extra term in a special seventh form class, customary in the school for those preparing for [[Oxbridge]] entrance exams.
 401+
 402+While at the prep school, his English teacher, Frank Halford, reportedly
 403+awarded Adams the only ten out of ten of his entire teaching career for a
 404+creative writing exercise.<ref>http://www.bbc.co.uk/dna/h2g2/A3790659</ref>
 405+Adams remembered this for the rest of his life, especially when facing writer's
 406+block.<ref>{{cite book | author=Adams, Douglas | title=The Salmon of Doubt:
 407+Hitchhiking the Galaxy One Last Time | edition=US mass market paperback edition |
 408+publisher=Ballantine | year=2005 | pages=Page xix | id=ISBN 0-345-45529-0}}</ref>
 409+Some of Adams' earliest writing was published at the school, such as a report on
 410+the school's Photography Club in ''The Brentwoodian'' (in 1962) or spoof reviews
 411+in the school magazine ''Broadsheet'' (edited by [[Paul Neil Milne Johnstone]]).
 412+Adams also had a letter and short story published nationally in the UK in the boys'
 413+magazine ''The Eagle'' in 1965. He met [[Griff Rhys Jones]], who was in the year below,
 414+at the school, and was in the same class as "Stuckist" artist
 415+[[Charles Thomson (artist)|Charles Thomson]]; all three appeared together in
 416+a production of Shakespeare's Julius Caesar in 1968. He was six feet tall (1.83 m)
 417+by the time he was 12, and he stopped growing only at 6'5" (1.96 m).
 418+Later, he would often make self-ironic jokes about his own towering stature,
 419+"...the form-master wouldn't say 'Meet under the clock tower,' or
 420+'Meet under the War Memorial,' but 'Meet under Adams.'"
 421+<ref>{{cite book | author=Adams, Douglas | title=The Salmon of Doubt:
 422+Hitchhiking the Galaxy One Last Time | edition=First UK hardcover edition |
 423+publisher=Macmillan | year=2002 | pages=Page 7 | id=ISBN 0-333-76657-1}}</ref>
 424+
 425+### namespace = 0
 426+### title = Aaliyah
 427+### content
 428+
 429+{{Two other uses||Aaliyah's self-titled album|Aaliyah (album)||Aliyah (disambiguation)}}
 430+{{Infobox musical artist <!-- See Wikipedia:WikiProject Musicians -->
 431+| Name = Aaliyah
 432+| Img = Aaliyah5301.jpg<!--fair use image to be used only in this article-->
 433+| Img_capt = Promotional photo of Aaliyah from May 2001
 434+| Img_size = <!-- Only for images narrower than 220 pixels -->
 435+| Landscape =
 436+| Background = solo_singer
 437+| Birth_name = Aaliyah Dana Haughton<ref name="Aaliyah NNDB Profile">{{cite web| url =http://www.nndb.com/people/742/000024670/| title = Aaliyah NNDB Profile| publisher =NNDB| accessdate =2007-03-03}}</ref>
 438+| Alias = Baby Girl, The Princess of Hip-Hop Soul<br>Liyah<br>Wonder Woman<ref>[http://www.imdb.com/name/nm0004691/bio Aaliyah on IMDb]</ref>
 439+| Born = {{birth date|1979|1|16}}<br><small>[[Brooklyn]], [[New York]], [[United States|U.S.]]</small>
 440+| Died = {{death date and age|2001|8|25|1979|1|16}}<br><small>[[Abaco Islands]], [[The Bahamas]]</small>
 441+| Origin = {{Flagicon|USA}} [[Detroit, Michigan|Detroit]], [[Michigan]], [[United States|U.S.]]
 442+| Genre = [[R&B]], [[Hip hop soul]], [[Dance music|Dance]], [[Pop music|Pop]]
 443+| Occupation = [[Singer]], [[Model (person)|model]], [[dancer]], [[actress]]
 444+| Years_active = 1994 – 2001
 445+| Label = [[Blackground Records|Blackground]]
 446+| Associated_acts = [[Missy Elliott]], [[Timbaland]], [[Steve "Static" Garrett|Static]], [[R. Kelly]], [[Ginuwine]], [[Tweet (singer)|Tweet]]
 447+| URL = [http://www.aaliyah.com Aaliyah.com]
 448+}}
 449+'''Aaliyah Dana Haughton'''<ref name="Aaliyah NNDB Profile"/> ([[January 16]], [[1979]] – [[August 25]], [[2001]]), known professionally as '''Aaliyah''', was a [[Grammy Award]] winning [[United States|American]] [[singer]], [[dancer]], [[Model (person)|model]] and [[actress]]. Introduced to audiences by R&B singer [[R. Kelly]], Aaliyah became famous during the mid-1990s with several hit records from the songwriting/production team of [[Missy Elliott]] & [[Timbaland]] and their associate [[Steve "Static" Garrett]]. Aaliyah soon joined Timbaland's R&B and hip hop collective, the [[Superfriends Clique]].
 450+
 451+Notable for recording several hit records, including several number one R&B hits, a number one pop hit, and nine top 10 singles on the [[Billboard Hot 100]]. She also modeled for [[Tommy Hilfiger]] and starred in two [[motion pictures]] before dying in a plane crash in [[the Bahamas]].
 452+
 453+==Early years==
 454+Aaliyah Dana Haughton was born in Bedford Stuyvesant, Brooklyn, New York on January 16, 1979 to Michael and Diane Haughton, and was raised in Detroit, Michigan. Her name means "Highest, Most Exalted, The Best" in [[Arabic language|Arabic]] or "to ascend" in [[Hebrew]]. Aaliyah was brought up as a [[Catholic]] with her older brother [[Rashad Haughton]]. Her grandmother, Mintis L. Hicks Hankerson, was of [[African American]] and [[Native Americans in the United States|Native American]] descent. Diane Haughton, Aaliyah's mother, also a vocalist, encouraged her daughter's career. Her uncle, [[Barry Hankerson]], is a prominent individual in the music industry and Aaliyah's aunt, through marriage to Hankerson, is [[Gladys Knight]], a legendary soul singer with [[Gladys Knight & the Pips]].
 455+
 456+She appeared on the TV talent show ''Star Search'' at age ten, singing her mother's favorite song, "My Funny Valentine". Although she did not win, Aaliyah worked with an agent in New York and began to attend auditions for TV shows, including ''[[Family Matters (TV series)|Family Matters]]''.
 457+
 458+Following her appearance on ''Star Search'' Aaliyah performed on stage in [[Las Vegas]] with Gladys Knight. In her early teens, Aaliyah attended the Detroit High School for the Fine and Performing Arts, and graduated as a dance major with a 4.0 GPA
 459+
 460+==''Age Ain't Nothing But a Number'' (1994)==
 461+[[Image:Aaliyah-age-aint-94.jpg|right|200px|thumb|Cover of ''[[Age Ain't Nothing but a Number]]''.]]
 462+Aaliyah signed with her uncle [[Barry Hankerson]]'s [[Blackground Records]] label in 1993 at the age of 14. She released her debut album, titled ''[[Age Ain't Nothing but a Number]]'', in 1994 at the age of 15. [[R. Kelly]], Aaliyah's then alleged husband, was a leading songwriter and producer on her debut album. The album displayed her smooth and velvety vocals and the production work was said to be original and innovative. The album went [[platinum album|platinum]] within months. The album featured the gold-selling singles "[[Back and Forth (Aaliyah song)|Back and Forth]]" (#1 U.S. R&B, 3 weeks and #5 [[Hot 100]] ), "[[At Your Best]]" (#2 U.S. R&B and #6 [[Hot 100]] ), a cover of [[The Isley Brothers]]' 1976 song, the album-titled single "[[Age Ain't Nothing But A Number (song)|Age Ain't Nothing But A Number]]" (#75 [[Hot 100]] and #35 US R&B, 2 weeks), and "Down with the Clique" (#33 UK Top 75 Singles). "Back and Fourth" was sampled by [[Madonna (entertainer)|Madonna]] for the track, "[[Inside of Me]]" which appears on her 1994 album ''[[Bedtime Stories]]''. In June 1995, Aaliyah released another single to radio only, "No One Knows How to Love Me Quite Like You Do." The album has sold over 3.5 million copies in the U.S. to date and nearly 7 million worldwide.
 463+
 464+In 1994, a rumour surfaced that 15-year-old Aaliyah and 27-year-old R. Kelly had secretly married in the state of [[Illinois]]. Both initially denied. Although many websites and television shows claimed that they found a marriage certificate, it never has been truly proven that R. Kelly and Aaliyah have ever been married.
 465+
 466+==Guest appearances, movie roles and soundtracks (1995 - 2001)==
 467+[[Image:romeo must die dvd.jpg|150px|left|thumb|Aaliyah on the cover of her first film ''[[Romeo Must Die]] {{speedy-image-c}}'' alongside [[Jet Li]]]]
 468+In 1995 at age 16, Aaliyah performed "[[The Star-Spangled Banner]]" live at an [[Orlando Magic]] basketball game. Also during that year, she appeared on the soundtrack for ''[[A Low Down Dirty Shame]]'' with the minor international hit "The Thing I Like" (#33 UK). The song was also included on international versions of [[Age Ain't Nothing But A Number]].
 469+
 470+In 1997, Aaliyah appeared on the soundtrack album for the [[Fox Animation Studios]] animated feature ''[[Anastasia (1997 movie)|Anastasia]]'', singing the pop version of "[[Journey to the Past]]". The song was nominated for an [[Academy Awards|Academy Award]], and Aaliyah performed the song at the 1998 Academy Awards ceremony, becoming the youngest female recording artist to perform at the ceremony. Not only was Aaliyah the youngest female to perform but she was the youngest African American to have the nominee for [[Academy Award for Best Original Song|Best Original Song]].
 471+
 472+Aaliyah had a hit in 1998 with "[[Are You that Somebody]]" (number one airplay U.S. eight weeks), the main single from the ''[[Dr. Dolittle (film)|Dr. Dolittle]]'' soundtrack. Its video was the third most-played on [[MTV]] that year, and the song's success helped make Aaliyah a household name (and making her crowned as Queen of Urban Pop).
 473+
 474+In 2000, she co-starred with [[Jet Li]] in the [[martial arts]] film ''[[Romeo Must Die]]'', which debuted at number one at the box office. Aaliyah and Timbaland executive produced the film's soundtrack album and Aaliyah contributed four songs: "Are You Feelin' Me?," "I Don't Wanna," "Come Back in One Piece," a duet with [[DMX (rapper)|DMX]], and the international number one hit "[[Try Again]]." Aaliyah made history once more when "Try Again" became the first song to ever reach number one on the Billboard Hot 100 based solely on the strength of its radio airplay, without any single sales factored in. After the huge success of "Try Again" at radio, a [[12 inch single|12" maxi single]] was released for consumer purchase. The radio-only single, "I Don't Wanna", (which was also featured on the soundtracks for the films ''[[Next Friday]]'' and ''Romeo Must Die'') peaked at number five on the Billboard Hot R&B/Hip Hop Singles & Tracks chart.
 475+
 476+In 2001, Aaliyah went to [[Australia]] to co-star with [[Stuart Townsend]] in the film ''[[Queen of the Damned (film)|Queen of the Damned]]'', an adaptation of the [[Anne Rice]] novel of the same name. Aaliyah also recorded most of her third studio album, ''[[Aaliyah (album)|Aaliyah]]'', during this time.
 477+
 478+==''One in a Million'' (1996)==
 479+[[Image:aaliyah-one-in-a-million.jpg|200px|right|thumb|Cover of ''[[One in a Million (album)|One In A Million]]''.]]
 480+''[[One in a Million (album)|One In A Million]]'', Aaliyah's sophomore album, was chiefly written and produced by then unknowns [[Missy Elliott]] and [[Timbaland|Timothy "Timbaland" Mosley]] and released on [[August 27]], [[1996]] when she was 17 years old. The album was a landmark in Aaliyah's career, garnering her mass critical acclaim and introducing Aaliyah's more mature side. It embarked the newfound chemistry of Aaliyah and Timbaland. The album was certified double-platinum within a year, making Aaliyah a major R&B star and igniting the successful careers of Missy Elliott and Timbaland. ''One in a Million'' featured the international smash hit "[[If Your Girl Only Knew]]" (number one U.S. R&B, 2 weeks and #11 Hot 100), "[[One in a Million (Aaliyah song)|One In A Million]]," (#1 U.S. R&B airplay, six weeks & #25 US Hot 100 Airplay), the #8 U.S. R&B and #9 [[Hot 100]] single "[[The One I Gave My Heart To]]," a ballad written by [[Diane Warren]], "[[4 Page Letter]]" (#12 R&B Airplay), "[[Hot Like Fire (Timbaland Remix)|Hot Like Fire]]" (two versions) (#31 R&B Airplay), and "[[Got to Give It Up (Aaliyah song)|Got To Give It Up]](#37 UK)" (a remake of the [[1977]] [[Marvin Gaye]] song).
 481+
 482+[[Tommy Hilfiger]] gave Aaliyah her first endorsement deal. He signed Aaliyah onto print campaigns, runway shows, and a commercial. During this period, Aaliyah would also make guest appearances on albums by artists such as [[Missy Elliott]], [[Timbaland & Magoo]], [[Ginuwine]] and [[Playa (band)|Playa]]. [[Timbaland]] and [[Playa]]'s frontman [[Steve "Static" Garrett]] would remain Aaliyah's principal collaborators for the duration of her career. To date, ''One in a Million'' has sold over 3.7 million copies in the U.S. and over 11 million worldwide.
 483+
 484+After the success of ''One in a Million'', in 1997 Aaliyah headlined in her own tour "The Hot Like Fire Tour", in which she toured various major city venues performing hits from Her albums.
 485+
 486+==''Aaliyah'' (2001)==
 487+<!--[[Image:Aaliyah-ep-2001.jpg|200px|left|thumb|Cover of [[Aaliyah (album)|Aaliyah]] commenting out image with no source/bad FairUse claim-->
 488+"[[We Need a Resolution (Aaliyah song)|We Need a Resolution]]," the first single from Aaliyah's third studio album, was released [[April 24]], 2001 (see [[2001 in music]]). The self-titled ''[[Aaliyah (album)|Aaliyah]]'' was released three months later on [[July 17]], [[2001]]. The album was an instant critical success but sales were initially lower than expected, although they increased considerably after her death. ''Aaliyah'' introduced a darker and edgier side to Aaliyah's music and was noted as having showcased her growth as an artist. Around the time of the album's recording and release she had been filming ''Queen of the Damned'', which helped her show a dark and edgy side as her character was a deadly villain. The album debuted at number two on the [[Billboard 200]] chart, selling 190,000 copies in its first week, and was certified gold (500,000 copies sold) within four weeks, before her death. The week after the plane crash it climbed to number one. [[Trent Reznor]] of [[Nine Inch Nails]] was to produce a song on the album but scheduling conflicts did not permit the collaboration.
 489+
 490+==="More Than A Woman" and "Rock The Boat"===
 491+There was no shortage of confusion at the label regarding the next single from the Aaliyah album. Aaliyah had been promoting "[[More Than a Woman (Aaliyah song)|More Than a Woman]]", having performed it twice and shooting a video with director Dave Meyers in the summer of 2001. According to Blackground, a remix featuring State Property and Jay-Z was also planned, but was scrapped due to lack of adequate funds. The video was to be released but "[[Rock the Boat (Aaliyah song)|Rock the Boat]]" began receiving huge amounts of radio-play, so she was immediately sent to the Bahamas to shoot the video. The "[[Rock The Boat]]" music video was put in the 106 and Park hall of fame, making the countdown over 65 times and landed at #2 on [[BET]]'s Top 100 videos of 2001. "[[More Than a Woman]]" made the number-one spot after "Rock the Boat" was retired. "Rock the Boat" was #2 U.S. R&B and #14 Hot 100 single. "I Care 4 U" was #3 U.S. R&B and #16 Hot 100 single. The album went on to sell over 8 million copies worldwide.
 492+
 493+==Death==
 494+On August 25, 2001, at 6:49 pm, just after wrapping up filming of the "Rock the Boat" video, Aaliyah and various members of her record company boarded a twin engine [[Cessna 402]]B (N8097W) at Marsh Harbour, Abaco Island, Bahamas to travel to Opa-locka Airport near Miami, Florida, but the plane crashed shortly after takeoff about 200 feet from the runway. Pilot Luis Morales III and all eight passengers, including Aaliyah, were killed in the aviation incident. According to findings from an inquest conducted by the coroner's office in the Bahamas, Aaliyah suffered from "severe burns and a blow to the head," in addition to severe shock. The coroner theorized that, even if Aaliyah survived the crash, her recovery would have been virtually impossible given the severity of her injuries.<ref>[http://www.caribbeannetnews.com/2003/11/21/aaliyah.htm Bahamas Coroner delivers verdict in Aaliyah death crash.] (2003, November 21). ''Caribbean News''. Retrieved February 9, 2007.</ref>
 495+
 496+Aaliyah's eulogy was held on August 31 at Saint Ignatius Loyola Roman Catholic Church on East 84th Street in Manhattan. A horse-drawn carriage then carried her coffin to [[Ferncliff Cemetery]] in Hartsdale, New York, where she was initially interred in a crypt in the extension wing of the main mausoleum. When the Rosewood Mausoleum was completed a couple of years later, Aaliyah was moved to a private room in the new building. The inscription of her alias ''Baby Girl'' is engraved on her crypt.
 497+
 498+===Investigation===
 499+{{Copypaste}}
 500+
 501+NTSB reports indicate that the pilot, Luis Morales III, was not qualified to pilot the plane he was attempting to fly. Morales falsely obtained his FAA license by showing hundreds of hours never flown, and he may also have falsified how many hours he had flown in order to get a job with his employer, Blackhawk International Airways. Additionally, an autopsy performed on Morales revealed cocaine and alcohol in his blood.
 502+
 503+Further investigations determined the plane was over its total gross weight by several hundred pounds. Although witnesses claimed that the passengers had been asked to leave some luggage behind, it was later discovered that the passengers, including Aaliyah, had not been informed of the excess weight.
 504+
 505+Eddie Golson, president of Pro Freight Cargo Services at Opa-locka Airport, said workers carted "a pickup truck of freight" from the crash site Monday. "That's absurd to think that this pilot got in this airplane with eight other people and a truck full of freight and expected this thing to fly," Golson said. "What the hell was going on?" a baggage handler was reported to have said, in reaction to hearing that no one weighed the passengers or baggage. Two of the passengers, members of Aaliyah's entourage, weighed in the neighborhood of 300 pounds and sat in the rear of the plane, where the baggage was also stored.
 506+
 507+The day of the aviation incident was Mr. Morales' first official day with Blackhawk International Airways, a Part 135 single-pilot operation. He had been employed with Golden Airlines, from which he was fired only four hours before the fatal aviation incident. In addition, Luis Morales III was not registered with the FAA to fly for Blackhawk. As a result of the incident, a wrongful death lawsuit was filed by Aaliyah’s parents and was later settled out of court for an undisclosed amount.
 508+
 509+Barry & Sons, Inc., a corporation formed in 1992 to develop, promote and capitalize on the musical talents of Aaliyah and to oversee the production and distribution of her records, tapes and music videos, brought an unsuccessful lawsuit in the Supreme Court of the State of New York against Instinct Productions LLC, (a company hired by Barry & Sons, Inc. in August, 2001 to produce the "Rock the Boat" music video). The case was dismissed since New York State's wrongful death statute only permits certain people to recover damages for wrongful death.<ref>[http://www.courts.state.ny.us/reporter/3dseries/2005/2005_00096.htm Text of appellate division decision dismissing the case.]</ref>
 510+
 511+==Legacy==
 512+"Rock the Boat" went on to become a [[Posthumous work|posthumous]] hit on radio (reaching number two on Billboard's Hot R&B Singles charts, number 14 on the Hot 100, and number 12 in the UK) and video channels, and the news of Aaliyah's death gave her album a notable sales boost, pushing it to number one on the Billboard 200. The album produced two other singles. "More than A Woman" reached number 7 on Billboard's Hot R&B singles chart, number 25 on Hot 100, and number one in the UK. "I Care 4 U" reached number three on Billboard's Hot R&B singles chart and number 16 on the Hot 100, the latter attaining success even without the promotional push of a [[music video]]. The ''Aaliyah'' album went on to sell over 3 million copies in the U.S. ''[[Queen of the Damned (film)|Queen of the Damned]]'' was released in early 2002. Before its release, Aaliyah's brother Rashad was called upon to re-dub several of his sister's lines during the post-production [[Dubbing (music)|ADR]] process. Upon its release, the film debuted at number one. The film was also dedicated to her.
 513+
 514+In 2001, Missy Elliott released her video for "[[Miss E... So Addictive|Take Away]]". The video contained words and images about Aaliyah. The single also featured Ginuwine and was the debut of Elliott's recent protégé, [[Tweet (singer)|Tweet]].
 515+
 516+Aaliyah and former [[The Beatles|Beatle]] [[George Harrison]] made UK Chart History in January 2002 when they scored the first, and to this date only, back-to-back posthumous number one hits (aside from the [[Elvis Presley]] re-releases in 2005). Aaliyah's "More than a Woman", released on January 7 and topped the chart on January 13, was followed by Harrison's "My Sweet Lord", re-released on January 14 and topped the chart on January 20. Aaliyah was voted one of "The Top 40 Woman of the Video Era" in VH1's The Greatest, also ranked #36 on their list of the 100 Sexiest Artist. Aaliyah also made E!'s list on the 101 Most Shocking Moments in Entertainment, Juciest Hollywood Hookups, and Best Kept Hollywood Secrets. Aaliyah recently ranked at #18 on BET's "Top 25 Dancers of All Time". In 2005, former Co-Star Jet-Li as reported from CNN stated the memory of Aaliyah had haunted him in Vancouver, where he and the late songstress shot the film [[Romeo Must Die]].
 517+
 518+In addition Aaliyah has been the topic of five books: ''Aaliyah: More than a Woman'' (2001) by Christopher John Farley, ''Aaliyah: An R&B Princess in Words and Pictures'' (2002) by Kelly Kenyatta, ''Aaliyah'' by [[Tim Footman]] (2003), and ''Aaliyah Remembered: Her Life & The Person behind the Mystique'' (2005) by William Sutherland.
 519+
 520+"Her legacy is, you can achieve your dreams and still maintain being a beautiful person"
 521+-Brother [[Rashad Haughton]]
 522+
 523+===Lyrical Dedications===
 524+
 525+* [[Boyz II Men]]: "Think Of Aaliyah" a.k.a. "The Aaliyah Song" - "''When you think of Aaliyah, laugh, don't cry, cause I know she want it that way''".
 526+* [[Brandy (entertainer)|Brandy]]: "Turn It Up" - ''"Get Baby Girl's attention, she's more than a woman and we sure do miss her. I wanna represent her, Timbaland, Missy, you get the picture."'',"Should I Go" - "''this industry was more like a different world, when it was just me, Monica, and Baby Girl [Aaliyah], I never got to tell you how much you meant / I wish you and me both was sittin' here workin with Tim / Just to be in the presence of people that you affected on a personal level just makes me stop for a second. [inhales and exhales] You were such a blessing, you helped me answer all of my questions."''
 527+* [[D12]]: "9-11" - ''"We lost Aaliyah, lost our families, it takes no tenges. You don't need us to see the world is (messed) up, God can see it"''
 528+* [[Foxy Brown]]: "Big Bad Mama" - ''"Rhyme deep in footwear, via Spiga/ Like Aaliyah, One in a Million/There's MC's in this rap shit comin in illin/ like I did, laid the groundwork for five hits/ Member when I told y'all first week out/ Shipped a half a mil, niggaz freaked out/ Love yourself, put no one above thee/ Cause ain't nobody gon' fuck me like me, it's on'"''
 529+
 530+* [[Jadakiss]]: "Why" - ''"Why Aaliyah had to take that flight?"''
 531+* [[Jay-Z]]: "Miss You Remix" - names certain people who missed her after her death.
 532+* [[Juelz Santana]]: "One Day I Smile" - ''"Once again a deep thought of Aaliyah crosses my mind"''
 533+* [[Kanye West]]: "Never Let Me Down" - ''"But I can't complain what the accident did to my Left Eye / Cuz look what an accident did to Left Eye / First Aaliyah and now Romeo must die / I know I got angels watching me from the other side"''
 534+* [[Layzie Bone]]: "For The Thugs That's Gone" - " Too many celebrities perish, these people we love and cherish, and I had a chance to meet Aaliyah, but I was to embarrassed and I should of took a chance, I heard that from a man, Jam Master J was so real, you niggas don't understand, he told me to handle my business, make sure I pay my taxes, a little advice from a legend to keep my paper stacking, and I gotta give props to Eazy, that nigga put me on, if he didn't believe in the thugs you all wouldn't of heard of Bone.
 535+* [[Lil' Flip]]: "Hall of Fame Graveyard" - ''"From Eazy E to Aaliyah, we even lost Left Eye / How come the wack rappers live but the best die"''
 536+* [[Mary J. Blige]] - "[[MJB Da MVP]]" - ''"It was when Aaliyah died / I could hardly sleep / Thought about it every day / and it made Me change my ways"''
 537+* [[Missy Elliott]]: "Can You Hear Me?" - ''"I been checkin' on your moms and dad / And your brother since the day you left / Passed on and went away with God / But for your mama it's been / So damn hard / I hate to even hear her cry / Aaliyah she asked me why / Would her baby girl go this way / Can you give me better words to say / Cause One day she'll see you again / With the same old beautiful smile / Long hair and the voice of a hummingbird / You'll be singing them same old songs / Aaliyah can you hear me? / I hope that you're proud of me / Me and Tim we been doing our thing / But it's never been the same / Since you had to go / I ain't never met a friend / More incredible"''
 538+* [[Monica (singer)|Monica]]: "Go To Bed Mad" - ''"Argue about things so critical / And you heated over nothing / And just hang up the phone / I want / I wanna talk in the mood / See we need a resolution / Like that Aaliyah song"''
 539+* [[Mya]]: "After The Rain" - ''"No one could ever fill your shoes, you're one in a million"''
 540+* [[Nas]] featuring [[Quan (rapper)|Quan]]: "Just A Moment" - ''"And can we please have a moment to mourn? / For Pac, Biggie and Pun 'cause through us they live on / Jam Master Jay, Freaky Ty and Aaliyah / Big L and Left Eye, when we die we hope to see ya"''
 541+* [[Ray J]]: "War Is Over" - ''"One day one day one day / I hope to see my girl Aaliyah"''
 542+* [[The Game (rapper)|The Game]]: "Dreams" - ''"Martin Luther King had a dream, Aaliyah had a dream, Left Eye had a dream"'', "Runnin" - ''"God let me in, give me a room by Aaliyah with ESPN"''
 543+* [[T.I.]]: "Rubberband Man" - ''"throw your lighters up for my cousin Toot, Aaliyah, Left Eye, and Jam Master Jay"''
 544+* [[TQ]] : "Gone But Not Forgotten" - ''"Aaliyah, I wish we could've did a song, but baby girl when I get my wings, I'm gonna send your precious love"''
 545+* [[Wyclef Jean]]: "Industry" - ''"Back and forth and forth and back / Like Miss Aaliyah man do I miss her"''
 546+* [[Outkast]] ft. [[Killer Mike]]: "The Whole World" - ''"Mami, I'm coming, I hope u get off / Or rock your own boat like Aaliyah don taught / Back, back and forth, forth / Get that sailor on course course"''
 547+* French R'n'b singer [[Assia]] covered "Don't know what to tell ya" with French and Arabic lyrics and entitled it "Le prix pour t' aimer (Habibi Maareft Ach'n Oullek)" in her latest album "Encore et Encore".
 548+The Gossip - (covered are you that somebody) as a tribute to Aaliyah
 549+
 550+Others include tracks by ''[[DMX]], [[Yolanda Adams]], [[Tyrese]], [[R. Kelly]], [[TLC (band)|TLC]], [[Timbaland]] & Outsiderz 4 Life''.{{Fact|date=March 2007}}
 551+
 552+* [[Cooper C.]]: "Why...?" - "Imma rock da boat, Aaliyah, and be wit you. One day, hopefully I will see you too."
 553+
 554+===Unfinished Films===
 555+Aaliyah was to have had a supporting role as Zee, the wife of [[Harold Perrineau Jr.]]'s character, Link, in the two sequels to ''[[The Matrix]]''. The directors initially tried to find a way to incorporate her footage into the movies but decided against it due to lack of material available. The role was recast with [[Nona Gaye]] playing the character. Other films in which Aaliyah was signed to star in were ''[[Honey (2003 film)|Honey]]'' (which instead was filmed with [[Jessica Alba]] as the main character), and a [[Whitney Houston]]-produced remake of the 1976 film ''[[Sparkle (1976 film)|Sparkle]]'' (now in the works with [[Raven-Symoné]] as the main character).
 556+
 557+In addition, Aaliyah and one of her agents had pitched and signed a deal with Fox Searchlight Pictures for her to star in a film based upon a true story about interracial love.
 558+
 559+==''I Care 4 U'' (2002)==
 560+[[Image:aaliyah icare4u.jpg|right|thumb|200 px|Cover of ''[[I Care 4 U]]''.]]
 561+''I Care 4 U'' was released by Blackground Records on December 10, 2002. Along with her hit singles, a number of previously unreleased tracks were included on the album, including "[[Erica Kane]]", "Don't Worry" and "All I Need" and the new singles "Miss You", "Don't Know What to Tell Ya", and "Come Over." ''I Care 4 U'' debuted at an impressive #3 on the Billboard 200 and #1 on the R&B album charts (where it remained for 7 weeks). The album went on to sell over 2.6 million in the U.S. and 5 million worldwide.
 562+
 563+The video for "Miss You" features [[Missy Elliott]], [[Toni Braxton]], [[Lil' Kim]], [[Dallas Austin]], MTV presenter and close friend [[Ananda Lewis]], actor/singer [[Jamie Foxx]], [[AJ Calloway]], [[Free (rapper)|Free]], [[Quddus (MTV)|Quddus]], Missy's recent protegé and longtime friend [[Tweet]], [[U-God]] (of the ''[[Wu-Tang Clan]]'') and [[DMX (rapper)|DMX]], [[Rosario Dawson]], among others, paying tribute to Aaliyah. Following her death, her single "[[Miss You (2003 song)|Miss You]]" made it to #1 on the [[Billboard Hot 100|US R&B Charts]]. The album earned Aaliyah a [[posthumous]] [[Grammy]] for [[Best Instrumental Arrangment Accompanying Vocals]]
 564+
 565+The follow-up single to "Miss You" was "[[Don't Know What to Tell Ya]]". However it was only released in Europe and peaked at #22 in the UK and #57 in Germany. The "Handcuff Remix" became popular among fans who had bought the single. The third and final Single released (second in the U.S.) was [[Come Over (Aaliyah song)|Come Over]]. The single had moderate pop success peaking in the top 40 of The Hot 100 at #32. It did a lot better on the R&B charts becoming a top 10 hit peaking at #9.
 566+
 567+Shortly after, the "Greatest Hits : Special Fan Box" [http://www.amazon.co.uk/exec/obidos/ASIN/B0001GYH2A/ref=ord_cart_shr/202-2194674-4915813] was released. It featured re-packaged versions of the albums "One In A Million", "Aaliyah" and "I Care 4 U". It also featured a DVD containing all of Aaliyah's music videos. It was all packaged in a special box.
 568+
 569+==Aaliyah in the mid-2000s==
 570+In early/mid-2005, four previously unreleased Aaliyah tracks were leaked to the Internet: a cover of [[Gladys Knight & the Pips|Gladys Knight & the Pips']] "Giving Up", "Where Could He Be" featuring Missy Elliot and Tweet (which was sent to radio stations), "Steady Ground" featuring Static from Playa, and a duet with Digital Black from Playa entitled "Don't Think They Know". In January 2006, a new unreleased Aaliyah track was leaked to the Internet. Entitled "Time", it was a snippet of an unfinished song and was produced by Timbaland (Sample of this track can be found on YouTube) Buzz of a song titled "Girlfriends" has been brewing for years now since the death of Aaliyah, until recently [[Yaushameen Michael]] posted the song on her Myspace, a Duet with the late R&B Princess. There are also many other rumored unreleased tracks such as "Did You Hear", "Dont Think They Know" feat. Digital Black, "Forever in My Heart", and "Candy".
 571+
 572+==Merchandise and the Aaliyah Charity Fund==
 573+Aaliyah's official website features items such as t-shirts with Aaliyah's name on them. She has had a calendar with her pictures since 2002. In 2007, Aaliyah's mother Diane Haughton and former manager Paul Allcata hired branding and licensing agency Wicked Cow Entertainment to grow the Aaliyah licensing program. Plans are currently underway for an apparel and accessories line.<ref>{{cite|The Licensing Letter|epmcom.com|title=Properties Available for Licensing|publisher=EPM|author=The Licensing Letter|date=2007-04-12}}</ref>
 574+
 575+==Discography==
 576+{{further|[[Aaliyah discography]]}}
 577+
 578+===Albums===
 579+{| class="wikitable"
 580+! width=100| Year
 581+! width=200| Album Title
 582+! width=100| U.S.
 583+! width=100| Worldwide
 584+! width=100| U.S. Charts
 585+|- align="center"
 586+| 1994 || ''[[Age Ain't Nothing but a Number]]'' || 3.5 million || 7 million || 18
 587+|- align="center"
 588+| 1996 || ''[[One in a Million (album)|One in a Million]]'' || 3.7 million || 11 million {{fact}} || 18
 589+|- align="center"
 590+| 2001 || ''[[Aaliyah (album)|Aaliyah]]'' || 2.7 million || 8 million || 1
 591+|- align="center"
 592+| 2002 || ''[[I Care 4 U]]'' || 1.5 million || 6 million || 3
 593+|- align="center"
 594+| 2005 || ''[[Ultimate Aaliyah]]'' || 0.2 million || 2 million{{fact}} || -
 595+|-
 596+! colspan=2 | Total || 11.6 million || 34 million ||
 597+|}
 598+
 599+=== Number-one singles ===
 600+<!-- If it doesn't enter the singles chart, airplay charts are allowed to be putted. -->
 601+
 602+{{dablink|The following singles reached number one in the [[United States|U.S.]], the [[Hot R&B/Hip-Hop Songs|U.S. R&B]], the [[United Kingdom]] and the [[New Zealand]]. It also includes its peak in the [[United World Chart]]}}
 603+{| class="wikitable"
 604+|- bgcolor="#CCCCCC"
 605+!align="center" rowspan="2" | Year
 606+!align="center" rowspan="2" | Single
 607+!align="center" colspan="8" | Peak positions
 608+|- bgcolor="#FFFFFF"
 609+! width="60"|<small>US</small>
 610+! width="60"|<small>US R&B</small>
 611+! width="60"|<small>UK</small>
 612+! width="60"|<small>NZ</small>
 613+! width="60"|<small>United World Chart</small>
 614+|-
 615+|align="center" rowspan="1"|1994
 616+|align="left"|"[[Back and Forth (song)|Back and Forth]]"
 617+|align="center"|5
 618+|align="center"|'''1'''
 619+|align="center"|16
 620+|align="center"|18
 621+|align="center"|-
 622+|-
 623+|align="center" rowspan="2"|1996
 624+|align="left"|"[[If Your Girl Only Knew]]"
 625+|align="center"|11
 626+|align="center"|'''1'''
 627+|align="center"|15
 628+|align="center"|-
 629+|align="center"|-
 630+|-
 631+|align="left"|"[[One in a Million (Aaliyah song)|One in a Million]]"
 632+|align="center"|25
 633+|align="center"|'''1'''<sup>1<sup>
 634+|align="center"|15
 635+|align="center"|-
 636+|align="center"|-
 637+|-
 638+|align="center" rowspan="1"|1998
 639+|align="left"|"[[Are You That Somebody?]]"
 640+|align="center"|10
 641+|align="center"|'''1'''<sup>1<sup>
 642+|align="center"|11
 643+|align="center"|'''1'''
 644+|align="center"|-
 645+|-
 646+|align="center" rowspan="1"|2000
 647+|align="left"|"[[Try Again]]"
 648+|align="center"|'''1'''
 649+|align="center"|4
 650+|align="center"|5
 651+|align="center"|13
 652+|align="center"|4
 653+|-
 654+|align="center" rowspan="1"|2002
 655+|align="left"|"[[More Than a Woman (Aaliyah song)|More Than a Woman]]"
 656+|align="center"|25
 657+|align="center"|7
 658+|align="center"|'''1'''
 659+|align="center"|-
 660+|align="center"|37
 661+|-
 662+|align="center" rowspan="1"|2003
 663+|align="left"|"[[Miss You (Aaliyah song)|Miss You]]"
 664+|align="center"|3
 665+|align="center"|'''1'''
 666+|align="center"|7
 667+|align="center"|-
 668+|align="center"|29
 669+|-
 670+|align="center"|
 671+!align="center"|Total number-one singles
 672+|align="center"|'''1'''
 673+|align="center"|'''5'''
 674+|align="center"|'''1'''
 675+|align="center"|'''1'''
 676+|align="center"|-
 677+|}
 678+
 679+*Notes:
 680+<sup>1</sup> *Topped in the [[Hot R&B/Hip-Hop Airplay]]
 681+
 682+==Awards==
 683+This is a list of awards for which Aaliyah was nominated during her career.
 684+
 685+===1995===
 686+*1995 Nominated for an American Music Award for Favorite Soul/R&B New Artist
 687+
 688+*1995 Best R&B Female Vocal Performance for ''At Your Best'': Nominated
 689+
 690+* Nominated for Two MTV VMA's: ''Best New Artist in Video'' and '' Best R&B Video '' both for At Your Best
 691+
 692+* Nominated for three World Music Award: '' Worlds Best Selling Female Artist'', Worlds Best Selling New Artist'' and'' Worlds Best Selling R&B Artist''
 693+
 694+* Nominated for eight Billboard Music Awards: Best New R&B Artist, Best New Artist, Female Artist of the Year, Best R&B/Hip Hop single, Best Female R&B Single, Best Female Pop Single, Best Hip Hip/R&B Artist, Best R&B/Hip Hop Album
 695+
 696+* Source Awards: Best New Artist and Best Female R&B Artist '''WON'''
 697+
 698+===1999===
 699+* 1999 Nominated for two MTV Video Music Awards: R&B Video ("Are You That Somebody?"), Best Video from a Film ("Are You That Somebody")
 700+
 701+* 1999 Nominated for an American Music Award for Favorite Soul/R&B Female Artist.
 702+
 703+* 1999 Nominated for an NAACP Image Award for Outstanding Music Video ("Are You That Somebody?")
 704+
 705+* 1999 Nominated for two Soul Train Lady of Soul Awards for Best R&B/Soul Song and Best R&B/Soul or Rap Music Video ("Are You That Somebody?").
 706+
 707+*1999 Nominated for a Grammy Award for Best Female R&B Vocal Performance ("Are You That Someobdy?")
 708+
 709+*1999 Nominated for a [[Academy Award]] Best Original Song for: Journey To The Past
 710+
 711+===2000===
 712+*2000 Nominated for two Soul Train Lady of Soul Awards for Best RnB / Soul Single - Solo and Best RnB / Soul or Rap Music Video ("Try Again")
 713+* 2000 '''Won''' two MTV Video Music Awards for Best Female Video, Best Video from a Film ("Try Again")
 714+* 2000 Nominated for MTV Europe Music Video Award for Best RnB video
 715+*2000 Nominated for Radio Music Award Urban song of the year and Urban artist of the year.
 716+*2000 Nominated My VH1 music award nominee for double threat (Musicians-Actors) award.
 717+
 718+===2001===
 719+*2001 Nominated for a Grammy Award for Best Female R&B Vocal Performance ("Try Again")
 720+
 721+===2002===
 722+*2002 '''Won''' two American Music Awards: Favorite Soul/R&B Female Artist and Favorite Soul/R&B Album.
 723+*2002 Nominated for two Grammy Awards for Best Female R&B Vocal Performance ("Rock The Boat") and Best R&B Album ("Aaliyah")
 724+*2002 '''Won''' a Soul Train Award for R&B/Soul Single; Female ("Rock The Boat")
 725+*2002 '''Won''' the Best R&B / Soul Single, Solo Award and R&B/Soul or Rap Song of the Year at the Soul Train Lady of Soul Awards (for "Rock The Boat")
 726+*2002 Nominated for an MTV Video Music Award for Best R&B Video ("Rock The Boat")
 727+===2003===
 728+*2003 '''Won''' Source Awards: Best Female R&B Artist ''I Care 4 U''
 729+
 730+==Filmography==
 731+*''[[Romeo Must Die]]'' (2000) - Trish O'Day
 732+*''[[Queen of the Damned (film)|Queen of the Damned]]'' (2002) - Akasha
 733+
 734+===Unfinished films===
 735+
 736+*''[[The Matrix Reloaded]]'' (2003) (replaced by [[Nona Gaye]])
 737+*''[[The Matrix Revolutions]]'' (2003) (replaced by [[Nona Gaye]])
 738+*''[[Honey (2003 film)|Honey]]'' (2003) (replaced by [[Jessica Alba]])
 739+*''[[Sparkle]]'' (replaced by [[Raven-Symoné]])
 740+
 741+Aaliyah was [http://www.notstarring.com/actors/aaliyah considered to play a part in other films], including:
 742+
 743+*''[[Charlie's Angels (film)|Charlie's Angels]]'' (2000)
 744+*''[[Get Over It (film)|Get Over It]]'' (2001)
 745+*''[[Josie and the Pussycats (film)|Josie and the Pussycats]]'' (2001)
 746+*''[[Osmosis Jones]]'' (2001)
 747+
 748+==See also==
 749+*[[Blackground Records]]
 750+*[[Missy Elliott]]
 751+*[[Steve "Static" Garrett]]
 752+*[[Rashad Haughton]]
 753+*[[Swing Mob]]
 754+*[[Timbaland]]
 755+
 756+==References==
 757+{{reflist|2}}
 758+
 759+==Links==
 760+*[http://www.Aaliyah.com Official site]
 761+*[http://www.aaliyahicare4u.com Artist Website]
 762+*[http://www.myspace.com/aaliyah Aaliyah's Official Myspace Page]
 763+*{{imdb name|id=0004691|name=Aaliyah}}
 764+*{{nndb name|id=742/000024670|name=Aaliyah}}
 765+*[http://www.billboard.com/bbcom/bio/index.jsp?pid=36610&cr=artist&or=ASCENDING&sf=length&kw=aaliyah Aaliyah bio on Billboard]
 766+*[http://www.hibblenradio.com/2001-08-CBS-AaliyahCrash.mp3 MP3 of CBS News, Radio reports on crash from Abaco Island]
 767+
 768+<!-- PLEASE DO NOT ADD FANSITES -->
 769+
 770+{{Aaliyah}}
 771+
 772+[[Category:Aaliyah| ]]
 773+[[Category:American actor-singers]]
 774+[[Category:American pop singers]]
 775+[[Category:American dance musicians]]
 776+[[Category:American Roman Catholics]]
 777+[[Category:American rhythm and blues singers]]
 778+[[Category:American female singers]]
 779+[[Category:American film actors]]
 780+[[Category:American dancers]]
 781+[[Category:African-American singers]]
 782+[[Category:African-American actors]]
 783+[[Category:Michigan musicians]]
 784+[[Category:Swing Mob artists]]
 785+[[Category:People from Brooklyn]]
 786+[[Category:People from Detroit]]
 787+[[Category:Plane crash victims]]
 788+[[Category:1979 births]]
 789+[[Category:2001 deaths]]
 790+
 791+[[cs:Aaliyah]]
 792+[[de:Aaliyah]]
 793+[[et:Aaliyah]]
 794+[[es:Aaliyah]]
 795+[[fa:عالیه]]
 796+[[fr:Aaliyah]]
 797+[[gl:Aaliyah]]
 798+[[it:Aaliyah]]
 799+[[lt:Aaliyah]]
 800+[[ms:Aaliyah]]
 801+[[nl:Aaliyah]]
 802+[[ja:アリーヤ]]
 803+[[no:Aaliyah]]
 804+[[pl:Aaliyah]]
 805+[[pt:Aaliyah]]
 806+[[ru:Аалия]]
 807+[[simple:Aaliyah]]
 808+[[fi:Aaliyah]]
 809+[[sv:Aaliyah]]
 810+[[tl:Aaliyah]]
Index: branches/lucene-search-2.1/src/org/apache/lucene/search/CustomBoostQuery.java
@@ -0,0 +1,351 @@
 2+package org.apache.lucene.search;
 3+
 4+import org.apache.lucene.search.*;
 5+
 6+/**
 7+ * Licensed to the Apache Software Foundation (ASF) under one or more
 8+ * contributor license agreements. See the NOTICE file distributed with
 9+ * this work for additional information regarding copyright ownership.
 10+ * The ASF licenses this file to You under the Apache License, Version 2.0
 11+ * (the "License"); you may not use this file except in compliance with
 12+ * the License. You may obtain a copy of the License at
 13+ *
 14+ * http://www.apache.org/licenses/LICENSE-2.0
 15+ *
 16+ * Unless required by applicable law or agreed to in writing, software
 17+ * distributed under the License is distributed on an "AS IS" BASIS,
 18+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 19+ * See the License for the specific language governing permissions and
 20+ * limitations under the License.
 21+ */
 22+
 23+import java.io.IOException;
 24+import java.util.Set;
 25+
 26+import org.apache.lucene.index.IndexReader;
 27+import org.apache.lucene.search.ComplexExplanation;
 28+import org.apache.lucene.search.Explanation;
 29+import org.apache.lucene.search.Query;
 30+import org.apache.lucene.search.Scorer;
 31+import org.apache.lucene.search.Searcher;
 32+import org.apache.lucene.search.Similarity;
 33+import org.apache.lucene.search.Weight;
 34+import org.apache.lucene.util.ToStringUtils;
 35+
 36+/**
 37+ * Query that sets document score as a programmatic function of (up to) two (sub) scores.
 38+ * <ol>
 39+ * <li>the score of its subQuery (any query)</li>
 40+ * <li>(optional) the score of its boosting Query,
 41+ * for most simple/convineient use case this query would be a
 42+ * {@link org.apache.lucene.search.function.FieldScoreQuery FieldScoreQuery}</li>
 43+ * </ol>
 44+ * Subclasses can modify the computation by overriding {@link #customScore(int, float, float)}.
 45+ *
 46+ * Note: documents will only match based on the first sub scorer.
 47+ *
 48+ * <p><font color="#FF0000">
 49+ * WARNING: The status of the <b>search.function</b> package is experimental.
 50+ * The APIs introduced here might change in the future and will not be
 51+ * supported anymore in such a case.</font>
 52+ */
 53+public class CustomBoostQuery extends Query {
 54+
 55+ private Query subQuery;
 56+ private Query boostQuery; // optional, can be null
 57+ private boolean strict = false; // if true, boosting part of query does not take part in weights normalization.
 58+
 59+ /**
 60+ * Create a CustomBoostQuery over input subQuery.
 61+ * @param subQuery the sub query whose scored is being customed. Must not be null.
 62+ */
 63+ public CustomBoostQuery(Query subQuery) {
 64+ this(subQuery,null);
 65+ }
 66+
 67+ /**
 68+ * Create a CustomBoostQuery over input subQuery and a {@link Query}.
 69+ * @param subQuery the sub query whose score is being customed. Must not be null.
 70+ * @param boostQuery a value source query whose scores are used in the custom score
 71+ * computation. For most simple/convineient use case this would be a
 72+ * {@link org.apache.lucene.search.function.FieldScoreQuery FieldScoreQuery}.
 73+ * This parameter is optional - it can be null.
 74+ */
 75+ public CustomBoostQuery(Query subQuery, Query boostQuery) {
 76+ super();
 77+ this.subQuery = subQuery;
 78+ this.boostQuery = boostQuery;
 79+ if (subQuery == null) throw new IllegalArgumentException("<subqyery> must not be null!");
 80+ }
 81+
 82+ /*(non-Javadoc) @see org.apache.lucene.search.Query#rewrite(org.apache.lucene.index.IndexReader) */
 83+ public Query rewrite(IndexReader reader) throws IOException {
 84+ subQuery = subQuery.rewrite(reader);
 85+ if (boostQuery!=null) {
 86+ boostQuery = (Query) boostQuery.rewrite(reader);
 87+ }
 88+ return this;
 89+ }
 90+
 91+ /*(non-Javadoc) @see org.apache.lucene.search.Query#extractTerms(java.util.Set) */
 92+ public void extractTerms(Set terms) {
 93+ subQuery.extractTerms(terms);
 94+ if (boostQuery!=null) {
 95+ boostQuery.extractTerms(terms);
 96+ }
 97+ }
 98+
 99+ /*(non-Javadoc) @see org.apache.lucene.search.Query#clone() */
 100+ public Object clone() {
 101+ CustomBoostQuery clone = (CustomBoostQuery)super.clone();
 102+ clone.subQuery = (Query) subQuery.clone();
 103+ if (boostQuery!=null) {
 104+ clone.boostQuery = (Query) boostQuery.clone();
 105+ }
 106+ return clone;
 107+ }
 108+
 109+ /* (non-Javadoc) @see org.apache.lucene.search.Query#toString(java.lang.String) */
 110+ public String toString(String field) {
 111+ StringBuffer sb = new StringBuffer(name()).append("(");
 112+ sb.append(subQuery.toString(field));
 113+ if (boostQuery!=null) {
 114+ sb.append(", ").append(boostQuery.toString(field));
 115+ }
 116+ sb.append(")");
 117+ sb.append(strict?" STRICT" : "");
 118+ return sb.toString() + ToStringUtils.boost(getBoost());
 119+ }
 120+
 121+ /** Returns true if <code>o</code> is equal to this. */
 122+ public boolean equals(Object o) {
 123+ if (getClass() != o.getClass()) {
 124+ return false;
 125+ }
 126+ CustomBoostQuery other = (CustomBoostQuery)o;
 127+ return this.getBoost() == other.getBoost()
 128+ && this.subQuery.equals(other.subQuery)
 129+ && (this.boostQuery==null ? other.boostQuery==null
 130+ : this.boostQuery.equals(other.boostQuery));
 131+ }
 132+
 133+ /** Returns a hash code value for this object. */
 134+ public int hashCode() {
 135+ int boostHash = boostQuery==null ? 0 : boostQuery.hashCode();
 136+ return (getClass().hashCode() + subQuery.hashCode() + boostHash) ^ Float.floatToIntBits(getBoost());
 137+ }
 138+
 139+ /**
 140+ * Compute a custom score from the subQuery score and the boost query score.
 141+ * <p>
 142+ * Subclasses can override this method to modify the custom score.
 143+ * <p>
 144+ * The default computation herein is:
 145+ * <pre>
 146+ * ModifiedScore = (0.2 + boostScore * 10) * subQueryScore.
 147+ * </pre>
 148+ *
 149+ * @param doc id of the scored doc.
 150+ * @param subQueryScore score of that doc by the subQuery.
 151+ * @param boostScore score of that doc by the boost query.
 152+ * @return custom score.
 153+ */
 154+ public float customScore(int doc, float subQueryScore, float boostScore) {
 155+ return (0.2f + boostScore * 10) * subQueryScore;
 156+ }
 157+
 158+ /**
 159+ * Explain the custom score.
 160+ * Whenever overriding {@link #customScore(int, float, float)},
 161+ * this method should also be overridden to provide the correct explanation
 162+ * for the custom scoring part.
 163+ * @param doc doc being explained.
 164+ * @param subQueryExpl explanation for the sub-query part.
 165+ * @param boostExpl explanation for the value source part.
 166+ * @return an explanation for the custom score
 167+ */
 168+ public Explanation customExplain(int doc, Explanation subQueryExpl, Explanation boostExpl) {
 169+ float boostScore = boostExpl==null ? 1 : boostExpl.getValue();
 170+ float sc = (0.2f + boostScore * 10);
 171+ Explanation exp = new Explanation( sc * subQueryExpl.getValue(), "custom score: product of:");
 172+ exp.addDetail(subQueryExpl);
 173+ if (boostExpl != null) {
 174+ exp.addDetail(boostExpl);
 175+ }
 176+ return exp;
 177+ }
 178+ //=========================== W E I G H T ============================
 179+
 180+ private class CustomWeight implements Weight {
 181+ Searcher searcher;
 182+ Weight subQueryWeight;
 183+ Weight boostWeight; // optional
 184+ boolean qStrict;
 185+
 186+ public CustomWeight(Searcher searcher) throws IOException {
 187+ this.searcher = searcher;
 188+ this.subQueryWeight = subQuery.weight(searcher);
 189+ if (boostQuery!=null) {
 190+ this.boostWeight = boostQuery.createWeight(searcher);
 191+ }
 192+ this.qStrict = strict;
 193+ }
 194+
 195+ /*(non-Javadoc) @see org.apache.lucene.search.Weight#getQuery() */
 196+ public Query getQuery() {
 197+ return CustomBoostQuery.this;
 198+ }
 199+
 200+ /*(non-Javadoc) @see org.apache.lucene.search.Weight#getValue() */
 201+ public float getValue() {
 202+ return getBoost();
 203+ }
 204+
 205+ /*(non-Javadoc) @see org.apache.lucene.search.Weight#sumOfSquaredWeights() */
 206+ public float sumOfSquaredWeights() throws IOException {
 207+ float sum = subQueryWeight.sumOfSquaredWeights();
 208+ if (boostWeight!=null) {
 209+ if (qStrict) {
 210+ boostWeight.sumOfSquaredWeights(); // do not include ValueSource part in the query normalization
 211+ } else {
 212+ sum += boostWeight.sumOfSquaredWeights();
 213+ }
 214+ }
 215+ sum *= getBoost() * getBoost(); // boost each sub-weight
 216+ return sum ;
 217+ }
 218+
 219+ /*(non-Javadoc) @see org.apache.lucene.search.Weight#normalize(float) */
 220+ public void normalize(float norm) {
 221+ norm *= getBoost(); // incorporate boost
 222+ subQueryWeight.normalize(norm);
 223+ if (boostWeight!=null) {
 224+ if (qStrict) {
 225+ boostWeight.normalize(1); // do not normalize the ValueSource part
 226+ } else {
 227+ boostWeight.normalize(norm);
 228+ }
 229+ }
 230+ }
 231+
 232+ /*(non-Javadoc) @see org.apache.lucene.search.Weight#scorer(org.apache.lucene.index.IndexReader) */
 233+ public Scorer scorer(IndexReader reader) throws IOException {
 234+ Scorer subQueryScorer = subQueryWeight.scorer(reader);
 235+ Scorer boostScorer = (boostWeight==null ? null : boostWeight.scorer(reader));
 236+ return new CustomScorer(getSimilarity(searcher), reader, this, subQueryScorer, boostScorer);
 237+ }
 238+
 239+ /*(non-Javadoc) @see org.apache.lucene.search.Weight#explain(org.apache.lucene.index.IndexReader, int) */
 240+ public Explanation explain(IndexReader reader, int doc) throws IOException {
 241+ return scorer(reader).explain(doc);
 242+ }
 243+ }
 244+
 245+
 246+ //=========================== S C O R E R ============================
 247+
 248+ /**
 249+ * A scorer that applies a (callback) function to the scores of the subQuery.
 250+ */
 251+ private class CustomScorer extends Scorer {
 252+ private final CustomWeight weight;
 253+ private final float qWeight;
 254+ private Scorer subQueryScorer;
 255+ private Scorer boostScorer; // optional
 256+ private IndexReader reader;
 257+
 258+ // constructor
 259+ private CustomScorer(Similarity similarity, IndexReader reader, CustomWeight w,
 260+ Scorer subQueryScorer, Scorer boostScorer) throws IOException {
 261+ super(similarity);
 262+ this.weight = w;
 263+ this.qWeight = w.getValue();
 264+ this.subQueryScorer = subQueryScorer;
 265+ this.boostScorer = boostScorer;
 266+ this.reader = reader;
 267+ }
 268+
 269+ /*(non-Javadoc) @see org.apache.lucene.search.Scorer#next() */
 270+ public boolean next() throws IOException {
 271+ boolean hasNext = subQueryScorer.next();
 272+ if (boostScorer!=null && hasNext) {
 273+ boostScorer.skipTo(subQueryScorer.doc());
 274+ }
 275+ return hasNext;
 276+ }
 277+
 278+ /*(non-Javadoc) @see org.apache.lucene.search.Scorer#doc() */
 279+ public int doc() {
 280+ return subQueryScorer.doc();
 281+ }
 282+
 283+ /*(non-Javadoc) @see org.apache.lucene.search.Scorer#score() */
 284+ public float score() throws IOException {
 285+ float boostScore = (boostScorer==null || subQueryScorer.doc() != boostScorer.doc() ? 0 : boostScorer.score());
 286+ return qWeight * customScore(subQueryScorer.doc(), subQueryScorer.score(), boostScore);
 287+ }
 288+
 289+ /*(non-Javadoc) @see org.apache.lucene.search.Scorer#skipTo(int) */
 290+ public boolean skipTo(int target) throws IOException {
 291+ boolean hasNext = subQueryScorer.skipTo(target);
 292+ if (boostScorer!=null && hasNext) {
 293+ boostScorer.skipTo(subQueryScorer.doc());
 294+ }
 295+ return hasNext;
 296+ }
 297+
 298+ /*(non-Javadoc) @see org.apache.lucene.search.Scorer#explain(int) */
 299+ public Explanation explain(int doc) throws IOException {
 300+ Explanation subQueryExpl = weight.subQueryWeight.explain(reader,doc);
 301+ if (!subQueryExpl.isMatch()) {
 302+ return subQueryExpl;
 303+ }
 304+ // match
 305+ Explanation boostExpl = boostScorer==null ? null :
 306+ weight.qStrict ? boostScorer.explain(doc) : weight.boostWeight.explain(reader,doc);
 307+ Explanation customExp = customExplain(doc,subQueryExpl,boostExpl);
 308+ float sc = qWeight * customExp.getValue();
 309+ Explanation res = new ComplexExplanation(
 310+ true, sc, CustomBoostQuery.this.toString() + ", product of:");
 311+ res.addDetail(customExp);
 312+ res.addDetail(new Explanation(qWeight, "queryBoost")); // actually using the q boost as q weight (== weight value)
 313+ return res;
 314+ }
 315+ }
 316+
 317+ /*(non-Javadoc) @see org.apache.lucene.search.Query#createWeight(org.apache.lucene.search.Searcher) */
 318+ protected Weight createWeight(Searcher searcher) throws IOException {
 319+ return new CustomWeight(searcher);
 320+ }
 321+
 322+ /**
 323+ * Checks if this is strict custom scoring.
 324+ * In strict custom scoring, the ValueSource part does not participate in weight normalization.
 325+ * This may be useful when one wants full control over how scores are modified, and does
 326+ * not care about normalizing by the ValueSource part.
 327+ * One particular case where this is useful is when testing this query.
 328+ * <P>
 329+ * Note: only has effect when the ValueSource part is not null.
 330+ */
 331+ public boolean isStrict() {
 332+ return strict;
 333+ }
 334+
 335+ /**
 336+ * Set the strict mode of this query.
 337+ * @param strict The strict mode to set.
 338+ * @see #isStrict()
 339+ */
 340+ public void setStrict(boolean strict) {
 341+ this.strict = strict;
 342+ }
 343+
 344+ /**
 345+ * A short name of this query, used in {@link #toString(String)}.
 346+ */
 347+ public String name() {
 348+ return "custom";
 349+ }
 350+
 351+}
 352+
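For illustration, a minimal sketch of how the new query is meant to be combined with the rank-based value source added later in this commit (the field name and term are hypothetical):

    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.CustomBoostQuery;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.TermQuery;
    import org.wikimedia.lsearch.search.RankValueSource;
    import org.wikimedia.lsearch.search.RankValueSourceQuery;

    public class CustomBoostExample {
        public static void main(String[] args) {
            // relevance part: an ordinary term query
            Query sub = new TermQuery(new Term("contents", "wikipedia"));
            // boosting part: per-document rank read via the value source
            Query boost = new RankValueSourceQuery(new RankValueSource());
            CustomBoostQuery q = new CustomBoostQuery(sub, boost);
            q.setStrict(true); // keep the rank part out of weight normalization
            // default combination: (0.2 + boostScore * 10) * subQueryScore,
            // e.g. boostScore 0.5 and subQueryScore 2.0 give (0.2 + 5) * 2.0 = 10.4
            System.out.println(q.toString("contents"));
        }
    }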
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/ResultSet.java
@@ -1,6 +1,8 @@
22 package org.wikimedia.lsearch.beans;
33
44 import java.io.Serializable;
 5+import java.util.ArrayList;
 6+import java.util.Collection;
57
68 import org.apache.lucene.search.Explanation;
79
@@ -9,6 +11,7 @@
1012 public double score;
1113 public String namespace;
1214 public String title;
 15+ public ArrayList<String> context;
1316 Explanation explanation;
1417
1518 public ResultSet(String key) {
@@ -43,7 +46,25 @@
4447 @Override
4548 public String toString() {
4649 return score+" "+namespace+":"+title+(explanation==null? "" : "\n"+explanation);
47 - }
 50+ }
4851
 52+ public void addContext(Collection<String> texts){
 53+ if(texts == null)
 54+ return;
 55+ for(String t : texts)
 56+ addContext(t);
 57+ }
4958
 59+ public void addContext(String text){
 60+ if(context == null)
 61+ context = new ArrayList<String>();
 62+
 63+ context.add(text.replace('\n',' '));
 64+ }
 65+
 66+ public ArrayList<String> getContext(){
 67+ return context;
 68+ }
 69+
 70+
5071 }
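A short sketch of the intended use of the new context accessors (the key and snippets are illustrative):

    ResultSet rs = new ResultSet("0:Computer");
    rs.addContext("A computer is a machine\nfor computing"); // newline is folded to a space
    rs.addContext(java.util.Arrays.asList("snippet one", "snippet two")); // null-safe bulk add
    for (String c : rs.getContext())
        System.out.println(c);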
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/Title.java
@@ -17,11 +17,11 @@
1818 }
1919
2020 public Title(String key){
21 - String[] parts = key.split(":",2);
22 - if(parts.length != 2)
 21+ int col = key.indexOf(':');
 22+ if(col == -1)
2323 throw new RuntimeException("Wrong key format in Title constructor");
24 - this.namespace = Integer.parseInt(parts[0]);
25 - this.title = parts[1];
 24+ this.namespace = Integer.parseInt(key.substring(0,col));
 25+ this.title = key.substring(col+1);
2626 }
2727
2828 public String getKey(){
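The rewritten constructor is a micro-optimization on a hot path: indexOf/substring avoids the regex machinery and temporary array behind String.split, with identical behavior. For example:

    Title t = new Title("2:Rainman"); // namespace 2, title "Rainman"
    Title bad = new Title("Rainman"); // no colon: still throws RuntimeException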
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/prefix/PrefixIndexBuilder.java
@@ -119,7 +119,7 @@
120120 selected.add(sorted.get(i).getKey());
121121 }
122122 Document d = new Document();
123 - d.add(new Field("prefix",prefix,Field.Store.NO,Field.Index.UN_TOKENIZED));
 123+ d.add(new Field("prefix",prefix,Field.Store.NO,Field.Index.NO_NORMS));
124124 d.add(new Field("articles",new StringList(selected).toString(),Field.Store.YES,Field.Index.NO));
125125 writer.addDocument(d);
126126 }
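In Lucene 2.x, Field.Index.NO_NORMS indexes the value as a single token just like UN_TOKENIZED, but omits the norm, so the prefix field skips length normalization and saves a byte per document. The change in isolation:

    // before: untokenized, norms stored
    new Field("prefix", prefix, Field.Store.NO, Field.Index.UN_TOKENIZED);
    // after: untokenized, no norms
    new Field("prefix", prefix, Field.Store.NO, Field.Index.NO_NORMS);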
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/DumpImporter.java
@@ -38,17 +38,17 @@
3939 Revision revision;
4040 SimpleIndexWriter writer;
4141 int count = 0, limit;
42 - LinkAnalysisStorage las;
 42+ Links links;
4343 String langCode;
4444 RelatedStorage related;
4545
4646 public DumpImporter(String dbname, int limit, Boolean optimize, Integer mergeFactor,
47 - Integer maxBufDocs, boolean newIndex, LinkAnalysisStorage las, String langCode){
 47+ Integer maxBufDocs, boolean newIndex, Links links, String langCode){
4848 Configuration.open(); // make sure configuration is loaded
4949 IndexId iid = IndexId.get(dbname);
50 - writer = new SimpleIndexWriter(iid, optimize, mergeFactor, maxBufDocs, newIndex);
 50+ writer = new SimpleIndexWriter(links, iid, optimize, mergeFactor, maxBufDocs, newIndex);
5151 this.limit = limit;
52 - this.las = las;
 52+ this.links = links;
5353 this.langCode = langCode;
5454 this.related = new RelatedStorage(iid);
5555 if(!related.canRead())
@@ -62,26 +62,29 @@
6363 }
6464 public void writeEndPage() throws IOException {
6565 String key = page.Title.Namespace+":"+page.Title.Text;
66 - ArticleAnalytics aa = las.getAnaliticsForArticle(key);
67 - int references = aa.getReferences();
68 - boolean isRedirect = aa.isRedirect();
69 - int redirectTargetNamespace = aa.getRedirectTargetNamespace();
 66+ int references = links.getNumInLinks(key);
 67+ boolean isRedirect = links.isRedirect(key);
 68+ int redirectTargetNamespace = isRedirect? links.getRedirectTargetNamespace(key) : -1;
7069
7170 // make list of redirects
7271 ArrayList<Redirect> redirects = new ArrayList<Redirect>();
7372 ArrayList<String> anchors = new ArrayList<String>();
74 - anchors.addAll(aa.getAnchorText());
75 - for(String rk : aa.getRedirectKeys()){
 73+ //anchors.addAll(aa.getAnchorText());
 74+ for(String rk : links.getRedirectsTo(key)){
7675 String[] parts = rk.toString().split(":",2);
77 - ArticleAnalytics raa = las.getAnaliticsForReferences(rk);
78 - redirects.add(new Redirect(Integer.parseInt(parts[0]),parts[1],raa.getReferences()));
79 - anchors.addAll(raa.getAnchorText());
 76+ int redirectRef = links.getNumInLinks(rk);
 77+ redirects.add(new Redirect(Integer.parseInt(parts[0]),parts[1],redirectRef));
 78+ //anchors.addAll(raa.getAnchorText());
8079 }
8180 ArrayList<RelatedTitle> rel = null;
8281 if(related != null)
8382 rel = related.getRelated(key);
8483 else
8584 rel = new ArrayList<RelatedTitle>();
 85+ // extract contexts
 86+ /*for(RelatedTitle t : rel){
 87+ links.getContext(t.getRelated().getKey(),key);
 88+ } */
8689 // make article
8790 Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect,
8891 references,redirectTargetNamespace,redirects,rel,anchors);
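The importer now pulls per-article metadata straight from the links index; a minimal fragment of the calls it relies on (the article key is hypothetical; keys use the namespace:title format):

    IndexId ll = iid.getLinks();
    Links links = Links.openForRead(ll, ll.getImportPath());
    String key = "0:Computer";
    int references = links.getNumInLinks(key); // in-link count used for ranking
    int redirNs = links.isRedirect(key) ? links.getRedirectTargetNamespace(key) : -1;
    for (String rk : links.getRedirectsTo(key))
        System.out.println("redirect from: " + rk);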
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java
@@ -17,6 +17,7 @@
1818 import org.wikimedia.lsearch.index.IndexUpdateRecord;
1919 import org.wikimedia.lsearch.index.WikiIndexModifier;
2020 import org.wikimedia.lsearch.index.WikiSimilarity;
 21+import org.wikimedia.lsearch.ranks.Links;
2122
2223 /**
2324 * IndexWriter for building indexes from scratch.
@@ -33,8 +34,10 @@
3435 protected Integer mergeFactor, maxBufDocs;
3536 protected boolean newIndex;
3637 protected String langCode;
 38+ protected Links links;
3739
38 - public SimpleIndexWriter(IndexId iid, Boolean optimize, Integer mergeFactor, Integer maxBufDocs, boolean newIndex){
 40+ public SimpleIndexWriter(Links links, IndexId iid, Boolean optimize, Integer mergeFactor, Integer maxBufDocs, boolean newIndex){
 41+ this.links = links;
3942 this.iid = iid;
4043 this.optimize = optimize;
4144 this.mergeFactor = mergeFactor;
@@ -43,7 +46,7 @@
4447 GlobalConfiguration global = GlobalConfiguration.getInstance();
4548 langCode = global.getLanguage(iid.getDBname());
4649 FieldBuilder.Case dCase = (global.exactCaseIndex(iid.getDBname()))? FieldBuilder.Case.EXACT_CASE : FieldBuilder.Case.IGNORE_CASE;
47 - builder = new FieldBuilder(langCode,dCase);
 50+ builder = new FieldBuilder(iid,dCase);
4851 indexes = new HashMap<String,IndexWriter>();
4952 // open all relevant indexes
5053 if(iid.isSingle())
@@ -109,7 +112,7 @@
110113 IndexWriter writer = indexes.get(target.toString());
111114 if(writer == null)
112115 return;
113 - Object[] ret = WikiIndexModifier.makeDocumentAndAnalyzer(a,builder,iid);
 116+ Object[] ret = WikiIndexModifier.makeDocumentAndAnalyzer(a,builder,iid,links);
114117 Document doc = (Document) ret[0];
115118 Analyzer analyzer = (Analyzer) ret[1];
116119 try {
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/Importer.java
@@ -20,6 +20,7 @@
2121 import org.wikimedia.lsearch.ranks.Links;
2222 import org.wikimedia.lsearch.ranks.RankBuilder;
2323 import org.wikimedia.lsearch.related.CompactLinks;
 24+import org.wikimedia.lsearch.related.RelatedBuilder;
2425 import org.wikimedia.lsearch.storage.LinkAnalysisStorage;
2526 import org.wikimedia.lsearch.storage.Storage;
2627 import org.wikimedia.lsearch.util.Localization;
@@ -45,6 +46,7 @@
4647 Integer mergeFactor = null, maxBufDocs = null;
4748 boolean newIndex = true, makeSnapshot = false;
4849 boolean snapshotDb = false, useOldLinkAnalysis = false;
 50+ boolean useOldRelated = false;
4951
5052 System.out.println("MediaWiki Lucene search indexer - index builder from xml database dumps.\n");
5153
@@ -52,12 +54,13 @@
5355 log = Logger.getLogger(Importer.class);
5456
5557 if(args.length < 2){
56 - System.out.println("Syntax: java Importer [-a] [-n] [-s] [-la] [-l limit] [-o optimize] [-m mergeFactor] [-b maxBufDocs] <inputfile> <dbname>");
 58+ System.out.println("Syntax: java Importer [-a] [-n] [-s] [-l] [-r] [-lm limit] [-o optimize] [-m mergeFactor] [-b maxBufDocs] <inputfile> <dbname>");
5759 System.out.println("Options: ");
5860 System.out.println(" -a - don't create new index, append to old");
5961 System.out.println(" -s - make index snapshot when finished");
60 - System.out.println(" -la - use earlier link analysis index, don't recalculate");
61 - System.out.println(" -l limit_num - add at most limit_num articles");
 62+ System.out.println(" -l - use earlier link analysis index, don't recalculate");
 63+ System.out.println(" -r - use earlier related index, don't recalculate");
 64+ System.out.println(" -lm limit_num - add at most limit_num articles");
6265 System.out.println(" -o optimize - true/false overrides optimization param from global settings");
6366 System.out.println(" -m mergeFactor - overrides param from global settings");
6467 System.out.println(" -b maxBufDocs - overrides param from global settings");
@@ -65,7 +68,7 @@
6669 return;
6770 }
6871 for(int i=0;i<args.length;i++){
69 - if(args[i].equals("-l"))
 72+ if(args[i].equals("-lm"))
7073 limit = Integer.parseInt(args[++i]);
7174 else if(args[i].equals("-o"))
7275 optimize = Boolean.parseBoolean(args[++i]);
@@ -75,8 +78,10 @@
7679 maxBufDocs = Integer.parseInt(args[++i]);
7780 else if(args[i].equals("-a"))
7881 newIndex = false;
79 - else if(args[i].equals("-la"))
 82+ else if(args[i].equals("-l"))
8083 useOldLinkAnalysis = true;
 84+ else if(args[i].equals("-r"))
 85+ useOldRelated = true;
8186 else if(args[i].equals("-s"))
8287 makeSnapshot = true;
8388 else if(args[i].equals("--snapshot")){
@@ -106,17 +111,23 @@
107112 long start = System.currentTimeMillis();
108113
109114 if(!useOldLinkAnalysis){
110 - // regenerate link and redirect information
111 - Links links = RankBuilder.processLinks(inputfile,RankBuilder.getTitles(inputfile,langCode,iid),langCode);
 115+ // regenerate link and redirect information
112116 try {
113 - RankBuilder.storeLinkAnalysis(links,iid);
 117+ RankBuilder.processLinks(inputfile,Links.createNew(iid),iid,langCode);
114118 } catch (IOException e) {
115119 log.fatal("Cannot store link analytics: "+e.getMessage());
116120 return;
117121 }
118122 }
119 - log.info("Third pass, indexing articles...");
120 -
 123+ if(!useOldRelated){
 124+ try {
 125+ RelatedBuilder.rebuildFromLinks(iid);
 126+ } catch (IOException e) {
 127+ log.fatal("Cannot make related mapping: "+e.getMessage());
 128+ return;
 129+ }
 130+ }
 131+
121132 // open
122133 InputStream input = null;
123134 try {
@@ -124,31 +135,29 @@
125136 } catch (IOException e) {
126137 log.fatal("I/O error opening "+inputfile);
127138 return;
128 - }
129 - LinkAnalysisStorage las = new LinkAnalysisStorage(iid);
130 - // read
131 - DumpImporter dp = new DumpImporter(dbname,limit,optimize,mergeFactor,maxBufDocs,newIndex,las,langCode);
132 - XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(dp, 1000));
 139+ }
 140+ long end = start;
133141 try {
 142+ log.info("Indexing articles...");
 143+ IndexId ll = iid.getLinks();
 144+ Links links = Links.openForRead(ll,ll.getImportPath());
 145+ // read
 146+ DumpImporter dp = new DumpImporter(dbname,limit,optimize,mergeFactor,maxBufDocs,newIndex,links,langCode);
 147+ XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(dp, 1000));
134148 reader.readDump();
 149+ log.info("Closing/optimizing index...");
 150+ dp.closeIndex();
 151+ end = System.currentTimeMillis();
 152+ System.out.println("Cache stats: "+links.getCache().getStats());
135153 } catch (IOException e) {
136154 if(!e.getMessage().equals("stopped")){
137 - log.fatal("I/O error reading dump for "+dbname+" from "+inputfile);
 155+ log.fatal("I/O error processing dump for "+dbname+" from "+inputfile+" : "+e.getMessage());
 156+ e.printStackTrace();
138157 return;
139158 }
140 - }
141 -
142 - long end = System.currentTimeMillis();
143 -
144 - log.info("Closing/optimizing index...");
145 - try{
146 - dp.closeIndex();
147 - } catch(IOException e){
148 - e.printStackTrace();
149 - log.fatal("Cannot close/optimize index : "+e.getMessage());
150159 System.exit(1);
151160 }
152 -
 161+
153162 long finalEnd = System.currentTimeMillis();
154163
155164 System.out.println("Finished indexing in "+formatTime(end-start)+", with final index optimization in "+formatTime(finalEnd-end));
@@ -168,6 +177,16 @@
169178 } else
170179 IndexThread.makeIndexSnapshot(iid,iid.getImportPath());
171180 }
 181+
 182+ // some cache stats
 183+ /*Cache cache = CacheManager.create().getCache("links");
 184+ Statistics s = cache.getStatistics();
 185+
 186+ long hit = s.getCacheHits();
 187+ long miss = s.getCacheMisses();
 188+
 189+ System.out.println("Cache stats: hits = "+hit+", miss = "+miss); */
 190+
172191 }
173192
174193 private static String formatTime(long l) {
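With the renamed switches, a full rebuild that snapshots the result might be invoked as (paths and dbname illustrative):

    java org.wikimedia.lsearch.importer.Importer -s -lm 10000 enwiki-dump.xml enwiki

Adding -l and/or -r reuses the previously built links and related indexes instead of recalculating them.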
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/RelatedStorage.java
@@ -35,7 +35,7 @@
3636 StringList sl = new StringList(CompactRelated.convertToStringList(rel));
3737 Document doc = new Document();
3838 doc.add(new Field("key",key,Field.Store.YES,Field.Index.UN_TOKENIZED));
39 - doc.add(new Field("related",sl.toString(),Field.Store.YES,Field.Index.NO));
 39+ doc.add(new Field("related",sl.toString(),Field.Store.COMPRESS,Field.Index.NO));
4040 writer.addDocument(doc);
4141 }
4242
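Field.Store.COMPRESS (Lucene 2.x) stores the value zlib-compressed, which pays off here since the serialized related list can grow long; reads are unchanged because decompression is transparent:

    String related = reader.document(docId).get("related"); // decompressed automatically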
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/LinkAnalysisStorage.java
@@ -32,17 +32,20 @@
3333 * @author rainman
3434 *
3535 */
 36+@Deprecated
3637 public class LinkAnalysisStorage extends LuceneStorage {
3738 static Logger log = Logger.getLogger(LinkAnalysisStorage.class);
3839 protected SetBasedFieldSelector selRef;
3940
4041 public LinkAnalysisStorage(IndexId iid){
41 - super(iid.getLinkAnalysis());
 42+ //super(iid.getLinkAnalysis());
 43+ super(iid);
4244 init();
4345 }
4446
4547 public LinkAnalysisStorage(IndexId iid, String path){
46 - super(iid.getLinkAnalysis(),path);
 48+ //super(iid.getLinkAnalysis(),path);
 49+ super(iid,path);
4750 init();
4851 }
4952
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/IndexId.java
@@ -58,7 +58,7 @@
5959 /** If true, this machine is an indexer for this index */
6060 protected boolean myIndex;
6161
62 - protected enum IndexType { SINGLE, MAINSPLIT, SPLIT, NSSPLIT, SPELL, LINK_ANALYSIS, RELATED, PREFIX };
 62+ protected enum IndexType { SINGLE, MAINSPLIT, SPLIT, NSSPLIT, SPELL, LINKS, RELATED, PREFIX, PREFIX_TITLES };
6363
6464 /** Type of index, enumeration */
6565 protected IndexType type;
@@ -95,6 +95,9 @@
9696 protected String OAIRepository;
9797
9898 protected String rsyncSnapshotPath = null;
 99+
 100+ /** language code, e.g. "en" */
 101+ protected String langCode = null;
99102
100103 /**
101104 * Get index Id object given it's string representation, the actual object
@@ -105,7 +108,10 @@
106109 * @return
107110 */
108111 static public IndexId get(String dbrole){
109 - return GlobalConfiguration.getIndexId(dbrole);
 112+ IndexId ret = GlobalConfiguration.getIndexId(dbrole);
 113+ if(ret == null)
 114+ throw new RuntimeException("Index "+dbrole+" doesn't exist");
 115+ return ret;
110116 }
111117
112118 /**
@@ -158,12 +164,14 @@
159165 this.type = IndexType.NSSPLIT;
160166 else if(type.equals("spell"))
161167 this.type = IndexType.SPELL;
162 - else if(type.equals("link_analysis"))
163 - this.type = IndexType.LINK_ANALYSIS;
 168+ else if(type.equals("links"))
 169+ this.type = IndexType.LINKS;
164170 else if(type.equals("related"))
165171 this.type = IndexType.RELATED;
166172 else if(type.equals("prefix"))
167173 this.type = IndexType.PREFIX;
 174+ else if(type.equals("prefix_titles"))
 175+ this.type = IndexType.PREFIX_TITLES;
168176
169177 // parts
170178 String[] parts = dbrole.split("\\.");
@@ -259,9 +267,9 @@
260268 public boolean isSpell(){
261269 return type == IndexType.SPELL;
262270 }
263 - /** If this is the link-analysis index */
264 - public boolean isLinkAnalysis(){
265 - return type == IndexType.LINK_ANALYSIS;
 271+ /** If this is the index storing pagelinks */
 272+ public boolean isLinks(){
 273+ return type == IndexType.LINKS;
266274 }
267275 /** If this is the index storing info about related articles */
268276 public boolean isRelated(){
@@ -271,6 +279,10 @@
272280 public boolean isPrefix(){
273281 return type == IndexType.PREFIX;
274282 }
 283+ /** If this is the index storing titles for the prefix index */
 284+ public boolean isPrefixTitles(){
 285+ return type == IndexType.PREFIX_TITLES;
 286+ }
275287
276288 /** If this is a split index, returns the current part number, e.g. for entest.part4 will return 4 */
277289 public int getPartNum() {
@@ -418,7 +430,7 @@
419431
420432 /** get all hosts that search db this iid belongs to */
421433 public HashSet<String> getDBSearchHosts(){
422 - if(isSingle() || isSpell() || isLinkAnalysis() || isRelated() || isPrefix())
 434+ if(isSingle() || isSpell() || isLinks() || isRelated() || isPrefix() || isPrefixTitles())
423435 return searchHosts;
424436 else{
425437 // add all hosts that search: dbname and all parts
@@ -469,7 +481,7 @@
470482 */
471483 public HashSet<String> getPhysicalIndexes() {
472484 HashSet<String> ret = new HashSet<String>();
473 - if(isSingle() || isSpell() || isLinkAnalysis() || isRelated() || isPrefix())
 485+ if(isSingle() || isSpell() || isLinks() || isRelated() || isPrefix() || isPrefixTitles())
474486 ret.add(dbrole);
475487 else if(isMainsplit() || isSplit() || isNssplit()){
476488 for(String p : splitParts)
@@ -545,9 +557,9 @@
546558 return get(dbname+".spell");
547559 }
548560
549 - /** Get the link analysis iid */
550 - public IndexId getLinkAnalysis() {
551 - return get(dbname+".link_analysis");
 561+ /** Get the pagelinks iid */
 562+ public IndexId getLinks() {
 563+ return get(dbname+".links");
552564 }
553565
554566 /** Get the related-articles index iid */
@@ -560,6 +572,17 @@
561573 return get(dbname+".prefix");
562574 }
563575
 576+ /** Get the prefix titles index iid */
 577+ public IndexId getPrefixTitles() {
 578+ return get(dbname+".prefix_titles");
 579+ }
564580
 581+ /** Get language code for this db, e.g. "en" */
 582+ public String getLangCode(){
 583+ if(langCode == null)
 584+ langCode = GlobalConfiguration.getInstance().getLanguage(dbname);
 585+ return langCode;
 586+ }
 587+
565588
566589 }
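The new accessors keep the dbrole strings in one place; illustrative use (dbname hypothetical):

    IndexId iid = IndexId.get("enwiki");    // now throws RuntimeException if unknown
    IndexId links = iid.getLinks();         // enwiki.links
    IndexId titles = iid.getPrefixTitles(); // enwiki.prefix_titles
    String lang = iid.getLangCode();        // e.g. "en", cached after the first lookup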
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/GlobalConfiguration.java
@@ -186,10 +186,12 @@
187187 }
188188 }
189189 // add the link analysis to indexers
190 - if(!types.contains("link_analysis"))
191 - database.get(dbname).put("link_analysis",new Hashtable<String,String>());
 190+ if(!types.contains("links"))
 191+ database.get(dbname).put("links",new Hashtable<String,String>());
192192 if(!types.contains("related"))
193193 database.get(dbname).put("related",new Hashtable<String,String>());
 194+ if(!types.contains("prefix_titles"))
 195+ database.get(dbname).put("prefix_titles",new Hashtable<String,String>());
194196 }
195197 // expand logical index names on searchers
196198 for(String host : search.keySet()){
@@ -232,7 +234,7 @@
233235 } else if(typeid.matches("nspart[1-9][0-9]*")){
234236 type = "nssplit";
235237 dbrole = dbname + "." + typeid;
236 - } else if(typeid.equals("spell") || typeid.equals("link_analysis") || typeid.equals("related") || typeid.equals("prefix")){
 238+ } else if(typeid.equals("spell") || typeid.equals("links") || typeid.equals("related") || typeid.equals("prefix") || typeid.equals("prefix_titles")){
237239 type = typeid;
238240 dbrole = dbname + "." + typeid;
239241 } else
@@ -252,7 +254,7 @@
253255 }
254256 boolean searched = (getSearchHosts(dbrole).size() != 0);
255257 if(!searched && !(typeid.equals("mainsplit") || typeid.equals("split")
256 - || typeid.equals("nssplit") || typeid.equals("link_analysis") || typeid.equals("related"))){
 258+ || typeid.equals("nssplit") || typeid.equals("links") || typeid.equals("related") || typeid.equals("prefix_titles"))){
257259 if(verbose)
258260 System.out.println("WARNING: in Global Configuration: index "+dbrole+" is not searched by any host.");
259261 }
@@ -519,7 +521,7 @@
520522 } else if(typeid.matches("nspart[1-9][0-9]*")){
521523 type = "nssplit";
522524 dbrole = dbname + "." + typeid;
523 - } else if(typeid.equals("spell") || typeid.equals("link_analysis") || typeid.equals("related") || typeid.equals("prefix")){
 525+ } else if(typeid.equals("spell") || typeid.equals("links") || typeid.equals("related") || typeid.equals("prefix") || typeid.equals("prefix_titles")){
524526 type = typeid;
525527 dbrole = dbname + "." + typeid;
526528 } else
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java
@@ -5,6 +5,7 @@
66 import java.net.URI;
77 import java.text.MessageFormat;
88 import java.util.ArrayList;
 9+import java.util.Collection;
910 import java.util.HashMap;
1011 import java.util.HashSet;
1112 import java.util.Hashtable;
@@ -31,14 +32,19 @@
3233 import org.wikimedia.lsearch.analyzers.WikiQueryParser;
3334 import org.wikimedia.lsearch.beans.ResultSet;
3435 import org.wikimedia.lsearch.beans.SearchResults;
 36+import org.wikimedia.lsearch.beans.Title;
3537 import org.wikimedia.lsearch.config.GlobalConfiguration;
3638 import org.wikimedia.lsearch.config.IndexId;
3739 import org.wikimedia.lsearch.frontend.SearchDaemon;
3840 import org.wikimedia.lsearch.frontend.SearchServer;
3941 import org.wikimedia.lsearch.interoperability.RMIMessengerClient;
 42+import org.wikimedia.lsearch.ranks.Links;
4043 import org.wikimedia.lsearch.ranks.StringList;
 44+import org.wikimedia.lsearch.related.Related;
 45+import org.wikimedia.lsearch.related.RelatedTitle;
4146 import org.wikimedia.lsearch.spell.Suggest;
4247 import org.wikimedia.lsearch.spell.SuggestQuery;
 48+import org.wikimedia.lsearch.util.Localization;
4349 import org.wikimedia.lsearch.util.QueryStringMap;
4450
4551 /**
@@ -54,6 +60,7 @@
5561 protected final int maxlines = 1000;
5662 protected final int maxoffset = 10000;
5763 protected static GlobalConfiguration global = null;
 64+ protected static Hashtable<String,Hashtable<String,Integer>> dbNamespaces = new Hashtable<String,Hashtable<String,Integer>>();
5865
5966 public SearchEngine(){
6067 if(global == null)
@@ -102,17 +109,87 @@
103110 // TODO: return searchTitles(searchterm);
104111 } else if (what.equals("prefix")){
105112 return prefixSearch(iid, searchterm);
 113+ } else if (what.equals("related")){
 114+ int offset = 0, limit = 100; boolean exactCase = false;
 115+ if (query.containsKey("offset"))
 116+ offset = Math.max(Integer.parseInt((String)query.get("offset")), 0);
 117+ if (query.containsKey("limit"))
 118+ limit = Math.min(Integer.parseInt((String)query.get("limit")), maxlines);
 119+ return relatedSearch(iid, searchterm, offset, limit);
106120 } else {
107121 SearchResults res = new SearchResults();
108122 res.setErrorMsg("Unrecognized search type. Try one of: " +
109 - "search, explain, raw, rawexplain, prefix.");
 123+ "search, explain, raw, rawexplain, prefix, related.");
110124 log.warn("Unknown request type [" + what + "].");
111125 return res;
112126 }
113127 return null;
114128 }
115129
116 - private SearchResults prefixSearch(IndexId iid, String searchterm) {
 130+ /** Convert User:Rainman into 2:Rainman */
 131+ protected String getKey(String title, IndexId iid){
 132+ int colon = title.indexOf(':');
 133+ if(colon != -1 && colon != title.length()-1){
 134+ String ns = title.substring(0,colon);
 135+ Integer inx = dbNamespaces.get(iid.getDBname()).get(ns.toLowerCase());
 136+ if(inx != null){
 137+ return inx +":"+ title.substring(colon+1);
 138+ }
 139+ }
 140+
 141+ return "0:" + title;
 142+ }
 143+
 144+ protected SearchResults relatedSearch(IndexId iid, String searchterm, int offset, int limit) {
 145+ readLocalization(iid);
 146+ IndexId rel = iid.getRelated();
 147+ IndexId lin = iid.getLinks();
 148+ SearcherCache cache = SearcherCache.getInstance();
 149+ SearchResults res = new SearchResults();
 150+ try {
 151+ IndexSearcherMul searcher = cache.getLocalSearcher(rel);
 152+ IndexReader reader = searcher.getIndexReader();
 153+ String key = getKey(searchterm,iid);
 154+ TermDocs td = reader.termDocs(new Term("key",key));
 155+ if(td.next()){
 156+ ArrayList<RelatedTitle> col = Related.convertToRelatedTitleList(new StringList(reader.document(td.doc()).get("related")).toCollection());
 157+ res.setNumHits(col.size());
 158+ res.setSuccess(true);
 159+ // TODO: this is extremely slow
 160+ Links links = Links.openForRead(lin,lin.getSearchPath());
 161+ for(int i=offset;i<offset+limit && i<col.size();i++){
 162+ RelatedTitle rt = col.get(i);
 163+ Title t = rt.getRelated();
 164+ ResultSet rs = new ResultSet(rt.getScore(),t.getNamespaceAsString(),t.getTitle());
 165+ rs.addContext(links.getContext(t.getKey(),key));
 166+ res.addResult(rs);
 167+ }
 168+ } else{
 169+ res.setSuccess(true);
 170+ res.setNumHits(0);
 171+ }
 172+ } catch (IOException e) {
 173+ e.printStackTrace();
 174+ log.error("I/O error in relatedSearch on "+rel+" : "+e.getMessage());
 175+ res.setErrorMsg("I/O Error processing index for "+rel);
 176+ }
 177+ return res;
 178+ }
 179+
 180+ protected void readLocalization(IndexId iid){
 181+ if(!dbNamespaces.containsKey(iid.getDBname())){
 182+ synchronized(dbNamespaces){
 183+ HashMap<String,Integer> m = Localization.getLocalizedNamespaces(iid.getLangCode(),iid.getDBname());
 184+ Hashtable<String,Integer> map = new Hashtable<String,Integer>();
 185+ if(m != null)
 186+ map.putAll(m);
 187+ dbNamespaces.put(iid.getDBname(),map);
 188+ }
 189+ }
 190+ }
 191+
 192+ protected SearchResults prefixSearch(IndexId iid, String searchterm) {
 193+ readLocalization(iid);
117194 IndexId pre = iid.getPrefix();
118195 SearcherCache cache = SearcherCache.getInstance();
119196 SearchResults res = new SearchResults();
@@ -144,7 +221,8 @@
145222 }
146223 } catch (IOException e) {
147224 // res.setErrorMsg("Internal error during prefix search: "+e.getMessage());
148 - log.error("Internal error in SearchEngine::prefixSearch : "+e.getMessage());
 225+ log.error("Internal error in prefixSearch on "+pre+" : "+e.getMessage());
 226+ res.setErrorMsg("I/O error on index "+pre);
149227 }
150228 return res;
151229 }
@@ -166,9 +244,10 @@
167245 localfilter = null;
168246 if(localfilter != null)
169247 log.info("Using local filter: "+localfilter);
170 - Hits hits = searcher.search(q,localfilter);
 248+ TopDocs hits = searcher.search(q,localfilter,offset+limit);
171249 return makeSearchResults(searcher,hits,offset,limit,iid,searchterm,q,searchStart,explain);
172250 } catch (IOException e) {
 251+ e.printStackTrace();
173252 SearchResults res = new SearchResults();
174253 res.setErrorMsg("Internal error in SearchEngine: "+e.getMessage());
175254 log.error("Internal error in SearchEngine while trying to search main part: "+e.getMessage());
@@ -186,7 +265,7 @@
187266 if(nsDefault == null || nsDefault.cardinality() == 0)
188267 nsDefault = new NamespaceFilter("0"); // default to main namespace
189268 FieldBuilder.Case dCase = exactCase? FieldBuilder.Case.EXACT_CASE : FieldBuilder.Case.IGNORE_CASE;
190 - FieldBuilder.BuilderSet bs = new FieldBuilder(global.getLanguage(iid.getDBname()),dCase).getBuilder(dCase);
 269+ FieldBuilder.BuilderSet bs = new FieldBuilder(iid,dCase).getBuilder(dCase);
191270 ArrayList<String> stopWords = null;
192271 try{
193272 stopWords = StopWords.getCached(iid);
@@ -354,7 +433,8 @@
355434
356435 /** Our scores can span several orders of magnitude, transform them to be more relevant to the user */
357436 public float transformScore(double score){
358 - return (float) (Math.log10(1+score*99)/2);
 437+ //return (float) (Math.log10(1+score*99)/2);
 438+ return (float) score;
359439 }
360440
361441 protected SearchResults makeSearchResults(SearchableMul s, TopDocs hits, int offset, int limit, IndexId iid, String searchterm, Query q, long searchStart, boolean explain) throws IOException{
@@ -375,14 +455,15 @@
376456 // fetch documents
377457 Document[] docs = s.docs(docids);
378458 int j=0;
379 - float maxScore = hits.getMaxScore();
 459+ //float maxScore = hits.getMaxScore();
 460+ float maxScore = 1;
380461 for(Document doc : docs){
381462 String namespace = doc.get("namespace");
382463 String title = doc.get("title");
383464 float score = transformScore(scores[j]/maxScore);
384465 ResultSet rs = new ResultSet(score,namespace,title);
385466 if(explain)
386 - rs.setExplanation(((WikiSearcher)s).explain(q,docids[j]));
 467+ rs.setExplanation(((Searcher)s).explain(q,docids[j]));
387468 res.addResult(rs);
388469 j++;
389470 }
@@ -410,8 +491,8 @@
411492 Document[] docs = s.docs(docids);
412493 int j=0;
413494 float maxScore = 1;
414 - if(numhits>0)
415 - maxScore = hits.score(0);
 495+ //if(numhits>0)
 496+ // maxScore = hits.score(0);
416497 for(Document doc : docs){
417498 String namespace = doc.get("namespace");
418499 String title = doc.get("title");
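Assuming the daemon's usual /what/dbname/searchterm URL layout, the new search type might be exercised as (host, port and dbname illustrative):

    http://localhost:8123/related/enwiki/Computer?offset=0&limit=20

The term is converted to a namespace:title key via getKey() (e.g. User:Rainman becomes 2:Rainman), looked up in the related index, and each hit is decorated with link context, which is why the TODO above flags the per-result Links lookups as slow.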
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/RankValueSourceQuery.java
@@ -0,0 +1,178 @@
 2+package org.wikimedia.lsearch.search;
 3+
 4+import java.io.IOException;
 5+import java.util.Set;
 6+
 7+import org.apache.lucene.index.IndexReader;
 8+import org.apache.lucene.search.ComplexExplanation;
 9+import org.apache.lucene.search.Explanation;
 10+import org.apache.lucene.search.Query;
 11+import org.apache.lucene.search.Scorer;
 12+import org.apache.lucene.search.Searcher;
 13+import org.apache.lucene.search.Similarity;
 14+import org.apache.lucene.search.Weight;
 15+import org.apache.lucene.search.function.DocValues;
 16+import org.apache.lucene.search.function.ValueSource;
 17+import org.apache.lucene.search.function.ValueSourceQuery;
 18+import org.apache.lucene.util.ToStringUtils;
 19+
 20+public class RankValueSourceQuery extends ValueSourceQuery {
 21+ protected ValueSource valSrc;
 22+ /**
 23+ * Create a value source query.
 24+ * @param valSrc provides the values that define the function used for scoring
 25+ */
 26+ public RankValueSourceQuery(ValueSource valSrc) {
 27+ super(valSrc);
 28+ this.valSrc = valSrc;
 29+ }
 30+
 31+ /*(non-Javadoc) @see org.apache.lucene.search.Query#rewrite(org.apache.lucene.index.IndexReader) */
 32+ public Query rewrite(IndexReader reader) throws IOException {
 33+ return this;
 34+ }
 35+
 36+ /*(non-Javadoc) @see org.apache.lucene.search.Query#extractTerms(java.util.Set) */
 37+ public void extractTerms(Set terms) {
 38+ // no terms involved here
 39+ }
 40+
 41+ private class ValueSourceWeight implements Weight {
 42+ Searcher searcher;
 43+ float queryNorm;
 44+ float queryWeight;
 45+
 46+ public ValueSourceWeight(Searcher searcher) {
 47+ this.searcher = searcher;
 48+ }
 49+
 50+ /*(non-Javadoc) @see org.apache.lucene.search.Weight#getQuery() */
 51+ public Query getQuery() {
 52+ return RankValueSourceQuery.this;
 53+ }
 54+
 55+ /*(non-Javadoc) @see org.apache.lucene.search.Weight#getValue() */
 56+ public float getValue() {
 57+ return queryWeight;
 58+ }
 59+
 60+ /*(non-Javadoc) @see org.apache.lucene.search.Weight#sumOfSquaredWeights() */
 61+ public float sumOfSquaredWeights() throws IOException {
 62+ queryWeight = getBoost();
 63+ return queryWeight * queryWeight;
 64+ }
 65+
 66+ /*(non-Javadoc) @see org.apache.lucene.search.Weight#normalize(float) */
 67+ public void normalize(float norm) {
 68+ this.queryNorm = 1;
 69+ queryWeight *= this.queryNorm;
 70+ }
 71+
 72+ /*(non-Javadoc) @see org.apache.lucene.search.Weight#scorer(org.apache.lucene.index.IndexReader) */
 73+ public Scorer scorer(IndexReader reader) throws IOException {
 74+ return new ValueSourceScorer(getSimilarity(searcher), reader, this);
 75+ }
 76+
 77+ /*(non-Javadoc) @see org.apache.lucene.search.Weight#explain(org.apache.lucene.index.IndexReader, int) */
 78+ public Explanation explain(IndexReader reader, int doc) throws IOException {
 79+ return scorer(reader).explain(doc);
 80+ }
 81+ }
 82+
 83+ /**
 84+ * A scorer that (simply) matches all documents, and scores each document with
 85+ * the value of the value source in effect. For example, if the value source
 86+ * is a (cached) field source, then the value of that field in that document
 87+ * will be used (assuming the field is indexed for this doc, with a single token).
 88+ */
 89+ private class ValueSourceScorer extends Scorer {
 90+ private final IndexReader reader;
 91+ private final ValueSourceWeight weight;
 92+ private final int maxDoc;
 93+ private final float qWeight;
 94+ private int doc=-1;
 95+ private final DocValues vals;
 96+
 97+ // constructor
 98+ private ValueSourceScorer(Similarity similarity, IndexReader reader, ValueSourceWeight w) throws IOException {
 99+ super(similarity);
 100+ this.weight = w;
 101+ this.qWeight = w.getValue();
 102+ this.reader = reader;
 103+ this.maxDoc = reader.maxDoc();
 104+ // this is when/where the values are first created.
 105+ vals = valSrc.getValues(reader);
 106+ }
 107+
 108+ /*(non-Javadoc) @see org.apache.lucene.search.Scorer#next() */
 109+ public boolean next() throws IOException {
 110+ for(;;) {
 111+ ++doc;
 112+ if (doc>=maxDoc) {
 113+ return false;
 114+ }
 115+ if (reader.isDeleted(doc)) {
 116+ continue;
 117+ }
 118+ return true;
 119+ }
 120+ }
 121+
 122+ /*(non-Javadoc) @see org.apache.lucene.search.Scorer#doc()
 123+ */
 124+ public int doc() {
 125+ return doc;
 126+ }
 127+
 128+ /*(non-Javadoc) @see org.apache.lucene.search.Scorer#score() */
 129+ public float score() throws IOException {
 130+ return qWeight * vals.floatVal(doc);
 131+ }
 132+
 133+ /*(non-Javadoc) @see org.apache.lucene.search.Scorer#skipTo(int) */
 134+ public boolean skipTo(int target) throws IOException {
 135+ doc=target-1;
 136+ return next();
 137+ }
 138+
 139+ /*(non-Javadoc) @see org.apache.lucene.search.Scorer#explain(int) */
 140+ public Explanation explain(int doc) throws IOException {
 141+ float sc = qWeight * vals.floatVal(doc);
 142+
 143+ Explanation result = new ComplexExplanation(
 144+ true, sc, RankValueSourceQuery.this.toString() + ", product of:");
 145+
 146+ result.addDetail(vals.explain(doc));
 147+ result.addDetail(new Explanation(getBoost(), "boost"));
 148+ result.addDetail(new Explanation(weight.queryNorm,"queryNorm"));
 149+ return result;
 150+ }
 151+ }
 152+
 153+ /*(non-Javadoc) @see org.apache.lucene.search.Query#createWeight(org.apache.lucene.search.Searcher) */
 154+ protected Weight createWeight(Searcher searcher) {
 155+ return new RankValueSourceQuery.ValueSourceWeight(searcher);
 156+ }
 157+
 158+ /* (non-Javadoc) @see org.apache.lucene.search.Query#toString(java.lang.String) */
 159+ public String toString(String field) {
 160+ return valSrc.toString() + ToStringUtils.boost(getBoost());
 161+ }
 162+
 163+ /** Returns true if <code>o</code> is equal to this. */
 164+ public boolean equals(Object o) {
 165+ if (getClass() != o.getClass()) {
 166+ return false;
 167+ }
 168+ RankValueSourceQuery other = (RankValueSourceQuery)o;
 169+ return this.getBoost() == other.getBoost()
 170+ && this.valSrc.equals(other.valSrc);
 171+ }
 172+
 173+ /** Returns a hash code value for this object. */
 174+ public int hashCode() {
 175+ return (getClass().hashCode() + valSrc.hashCode()) ^ Float.floatToIntBits(getBoost());
 176+ }
 177+
 178+
 179+}
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/RankDocValues.java
@@ -0,0 +1,36 @@
 2+package org.wikimedia.lsearch.search;
 3+
 4+import java.io.IOException;
 5+
 6+import org.apache.lucene.index.CorruptIndexException;
 7+import org.apache.lucene.index.IndexReader;
 8+import org.apache.lucene.index.TermDocs;
 9+import org.apache.lucene.search.function.DocValues;
 10+
 11+public class RankDocValues extends DocValues {
 12+ IndexReader reader;
 13+
 14+ public RankDocValues(IndexReader reader){
 15+ super(reader.maxDoc());
 16+ this.reader = reader;
 17+ }
 18+
 19+ protected int getValue(int doc){
 20+ try{
 21+ String r = reader.document(doc).get("rank"); return r == null ? 0 : Integer.parseInt(r); // guard against docs without a rank field
 22+ } catch(IOException e){
 23+ return 0;
 24+ }
 25+ }
 26+
 27+ @Override
 28+ public float floatVal(int doc) {
 29+ return getValue(doc);
 30+ }
 31+
 32+ @Override
 33+ public String toString(int doc) {
 34+ return "rank: "+getValue(doc);
 35+ }
 36+
 37+}
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/RankValueSource.java
@@ -0,0 +1,34 @@
 2+package org.wikimedia.lsearch.search;
 3+
 4+import java.io.IOException;
 5+
 6+import org.apache.lucene.index.IndexReader;
 7+import org.apache.lucene.search.function.DocValues;
 8+import org.apache.lucene.search.function.ValueSource;
 9+
 10+public class RankValueSource extends ValueSource {
 11+
 12+ @Override
 13+ public String description() {
 14+ return "rank";
 15+ }
 16+
 17+ @Override
 18+ public boolean equals(Object o) {
 19+ if(o == this)
 20+ return true;
 21+ else
 22+ return false;
 23+ }
 24+
 25+ @Override
 26+ public DocValues getValues(IndexReader reader) throws IOException {
 27+ return new RankDocValues(reader);
 28+ }
 29+
 30+ @Override
 31+ public int hashCode() {
 32+ return 0;
 33+ }
 34+
 35+}
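Taken together, the three Rank* classes form a match-all query that scores every document by its stored rank; a sketch (index path hypothetical):

    IndexSearcher searcher = new IndexSearcher("/path/to/index");
    Query rank = new RankValueSourceQuery(new RankValueSource());
    TopDocs top = searcher.search(rank, null, 10); // ten highest-ranked documents

Note that RankDocValues fetches the stored field with IndexReader.document() for every scored doc, so this is convenient but not cheap; a FieldCache-backed value source would likely be faster.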
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/UpdateThread.java
@@ -27,6 +27,7 @@
2828 import org.wikimedia.lsearch.interoperability.RMIMessengerClient;
2929 import org.wikimedia.lsearch.interoperability.RMIServer;
3030 import org.wikimedia.lsearch.util.Command;
 31+import org.wikimedia.lsearch.util.FSUtils;
3132
3233
3334 /**
@@ -179,16 +180,16 @@
180181 try{
181182 // if local, use cp -lr instead of rsync
182183 if(global.isLocalhost(iid.getIndexHost())){
183 - Command.exec("/bin/cp -lr "+iid.getSnapshotPath()+sep+li.timestamp+" "+iid.getUpdatePath());
 184+ FSUtils.createHardLinkRecursive(
 185+ iid.getSnapshotPath()+sep+li.timestamp,
 186+ updatepath);
184187 } else{
185188 File ind = new File(iid.getCanonicalSearchPath());
186189
187190 if(ind.exists()){ // prepare a local hard-linked copy of index
188 - ind = ind.getCanonicalFile();
189 - for(File f: ind.listFiles()){
190 - // a cp -lr command for each file in the index
191 - Command.exec("/bin/cp -lr "+ind.getCanonicalPath()+sep+f.getName()+" "+updatepath+sep+f.getName());
192 - }
 191+ FSUtils.createHardLinkRecursive(
 192+ ind.getCanonicalPath(),
 193+ updatepath);
193194 }
194195 long startTime = System.currentTimeMillis();
195196 // rsync
@@ -208,8 +209,8 @@
209210 SearcherCache.SearcherPool pool = new SearcherCache.SearcherPool(iid,li.path,cache.getSearchPoolSize());
210211
211212 // refresh the symlink
212 - Command.exec("/bin/rm -rf "+iid.getSearchPath());
213 - Command.exec("/bin/ln -fs "+updatepath+" "+iid.getSearchPath());
 213+ FSUtils.delete(iid.getSearchPath());
 214+ FSUtils.createSymLink(updatepath,iid.getSearchPath());
214215
215216 // update registry, cache, rmi object
216217 registry.refreshUpdates(iid);
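FSUtils (used throughout this commit to fix bug 11103) replaces the shelled-out commands used before; the calls above map onto them roughly as:

    FSUtils.createHardLinkRecursive(src, dst); // was: /bin/cp -lr src dst
    FSUtils.delete(path);                      // was: /bin/rm -rf path
    FSUtils.createSymLink(target, link);       // was: /bin/ln -fs target link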
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Warmup.java
@@ -65,7 +65,7 @@
6666 /** Warmup index using some number of simple searches */
6767 protected static void warmupSearchTerms(IndexSearcherMul is, IndexId iid, int count, boolean useDelay) {
6868 String lang = global.getLanguage(iid.getDBname());
69 - FieldBuilder.BuilderSet b = new FieldBuilder(lang).getBuilder();
 69+ FieldBuilder.BuilderSet b = new FieldBuilder(iid).getBuilder();
7070 WikiQueryParser parser = new WikiQueryParser(b.getFields().contents(),"0",Analyzers.getSearcherAnalyzer(iid,false),b,WikiQueryParser.NamespacePolicy.IGNORE,null);
7171 Terms terms = getTermsForLang(lang);
7272
@@ -122,7 +122,7 @@
123123 public static void simpleWarmup(IndexSearcherMul is, IndexId iid){
124124 try{
125125 String lang = global.getLanguage(iid.getDBname());
126 - FieldBuilder.BuilderSet b = new FieldBuilder(lang).getBuilder();
 126+ FieldBuilder.BuilderSet b = new FieldBuilder(iid).getBuilder();
127127 WikiQueryParser parser = new WikiQueryParser(b.getFields().contents(),"0",Analyzers.getSearcherAnalyzer(iid,false),b,WikiQueryParser.NamespacePolicy.IGNORE,null);
128128 Query q = parser.parseFourPass("a OR very OR long OR title OR involving OR both OR wikipedia OR and OR pokemons",WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname());
129129 is.search(q,new NamespaceFilterWrapper(new NamespaceFilter("0")));
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java
@@ -27,6 +27,7 @@
2828 import org.apache.lucene.store.Directory;
2929 import org.apache.lucene.store.FSDirectory;
3030 import org.wikimedia.lsearch.analyzers.Analyzers;
 31+import org.wikimedia.lsearch.analyzers.ContextAnalyzer;
3132 import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine;
3233 import org.wikimedia.lsearch.analyzers.FieldBuilder;
3334 import org.wikimedia.lsearch.analyzers.FieldNameFactory;
@@ -41,6 +42,7 @@
4243 import org.wikimedia.lsearch.config.GlobalConfiguration;
4344 import org.wikimedia.lsearch.config.IndexId;
4445 import org.wikimedia.lsearch.interoperability.RMIMessengerClient;
 46+import org.wikimedia.lsearch.ranks.Links;
4547 import org.wikimedia.lsearch.related.RelatedTitle;
4648 import org.wikimedia.lsearch.spell.api.SpellCheckIndexer;
4749 import org.wikimedia.lsearch.util.Localization;
@@ -169,7 +171,15 @@
170172 writer.setUseCompoundFile(true);
171173 writer.setMaxFieldLength(MAX_FIELD_LENGTH);
172174 FieldBuilder.Case dCase = (exactCase)? FieldBuilder.Case.EXACT_CASE : FieldBuilder.Case.IGNORE_CASE;
173 - FieldBuilder builder = new FieldBuilder(langCode,dCase);
 175+ FieldBuilder builder = new FieldBuilder(iid,dCase);
 176+ // TODO: fixme
 177+ Links links = null;
 178+ try {
 179+ links = Links.openForRead(iid,iid.getImportPath());
 180+ } catch (IOException e1) {
 181+ // TODO Auto-generated catch block
 182+ e1.printStackTrace();
 183+ }
174184
175185 for(IndexUpdateRecord rec : records){
176186 if(rec.doAdd()){
@@ -178,7 +188,7 @@
179189 if(!checkPreconditions(rec))
180190 continue; // article shouldn't be added for some reason
181191 IndexReportCard card = getReportCard(rec);
182 - Object[] ret = makeDocumentAndAnalyzer(rec.getArticle(),builder,iid);
 192+ Object[] ret = makeDocumentAndAnalyzer(rec.getArticle(),builder,iid,links);
183193 Document doc = (Document) ret[0];
184194 Analyzer analyzer = (Analyzer) ret[1];
185195 try {
@@ -400,9 +410,8 @@
401411 * @param languageAnalyzer
402412 * @return array { document, analyzer }
403413 */
404 - public static Object[] makeDocumentAndAnalyzer(Article article, FieldBuilder builder, IndexId iid){
 414+ public static Object[] makeDocumentAndAnalyzer(Article article, FieldBuilder builder, IndexId iid, Links links){
405415 PerFieldAnalyzerWrapper perFieldAnalyzer = null;
406 - WikiTokenizer tokenizer = null;
407416 Document doc = new Document();
408417
409418 // tranform record so that unnecessary stuff is deleted, e.g. some redirects
@@ -463,8 +472,10 @@
464473 doc.add(contents);
465474
466475 // related articles
467 - p = makeRelated(doc,fields.related(),article,1);
 476+ p = makeRelated(doc,fields.related(),article,1,fields.context());
468477
 478+ //makeContextField(doc,fields.context(),fields.related());
 479+
469480 // anchors
470481 // makeKeywordField(doc,fields.anchor(),rankBoost);
471482
@@ -479,7 +490,7 @@
480491 }
481492 // make analyzer
482493 String text = article.getContents();
483 - Object[] ret = Analyzers.getIndexerAnalyzer(text,builder,article.getRedirectKeywords(),article.getAnchorText(),article.getRelated(),p);
 494+ Object[] ret = Analyzers.getIndexerAnalyzer(text,builder,article.getRedirectKeywords(),article.getAnchorText(),article.getRelated(),p,article.makeTitle(),links);
484495 perFieldAnalyzer = (PerFieldAnalyzerWrapper) ret[0];
485496
486497
@@ -487,7 +498,7 @@
488499 }
489500
490501 /** Returns partioning of related titles, or null if there aren't any */
491 - protected static int[] makeRelated(Document doc, String prefix, Article article, float boost) {
 502+ protected static int[] makeRelated(Document doc, String prefix, Article article, float boost, String context) {
492503 ArrayList<RelatedTitle> rel = article.getRelated();
493504 if(rel == null || rel.size()==0)
494505 return null;
@@ -501,14 +512,32 @@
502513 for(int i=1;i<RelatedAnalyzer.RELATED_GROUPS;i++){
503514 Field relfield = new Field(prefix+i, "",
504515 Field.Store.NO, Field.Index.TOKENIZED);
505 - relfield.setBoost(boost*(float)MathFunc.avg(scores,p[i-1],p[i]));
 516+ float fb = boost*(float)MathFunc.avg(scores,p[i-1],p[i]);
 517+ relfield.setBoost(fb);
506518 doc.add(relfield);
 519+ if(i <= ContextAnalyzer.CONTEXT_GROUPS){
 520+ Field confield = new Field(context+i, "",
 521+ Field.Store.NO, Field.Index.TOKENIZED);
 522+ confield.setBoost(fb); // use same boost as related field
 523+ doc.add(confield);
 524+ }
507525 }
508526
509527 return p;
510528 }
511529
512 - /** Make a multiple keyword field, e.g. redirect1, redirect2, redirect3 ... */
 530+ /** Make a multiple context field ... */
 531+ protected static void makeContextField(Document doc, String prefix, String related) {
 532+ for(int i=1;i<=ContextAnalyzer.CONTEXT_GROUPS;i++){
 533+ Field keyfield = new Field(prefix+i, "",
 534+ Field.Store.NO, Field.Index.TOKENIZED);
 535+ keyfield.setBoost(doc.getField(related+i).getBoost()); // use same boost as related field
 536+ doc.add(keyfield);
 537+ }
 538+
 539+ }
 540+
 541+ /** Make a multiple keyword field, e.g. keyword1, keyword2, keyword3 ... */
513542 protected static void makeKeywordField(Document doc, String prefix, float boost) {
514543 for(int i=1;i<=KeywordsAnalyzer.KEYWORD_LEVELS;i++){
515544 Field keyfield = new Field(prefix+i, "",
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiSimilarity.java
@@ -39,7 +39,7 @@
4040 float f = (float) (1.0 / (Math.sqrt(numTokens) * numTokens));
4141 //log.debug("Length-norm: "+f+", numtokens: "+numTokens);
4242 return f;
43 - } else if(fieldName.startsWith("redirect") || fieldName.startsWith("keyword") || fieldName.startsWith("related") || fieldName.startsWith("anchor")){
 43+ } else if(fieldName.startsWith("redirect") || fieldName.startsWith("keyword") || fieldName.startsWith("related") || fieldName.startsWith("anchor") || fieldName.startsWith("context")){
4444 return 1;
4545 } else
4646 return super.lengthNorm(fieldName,numTokens);
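Worked example of the norms above: a field hitting the first branch with 4 tokens gets lengthNorm = 1/(sqrt(4)*4) = 0.125, and with 9 tokens 1/27 ≈ 0.037, a much steeper length penalty than Lucene's default 1/sqrt(numTokens); the flat fields (redirect, keyword, related, anchor, and now context) always return 1, so packing many values into them costs nothing in scoring.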
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/IndexThread.java
@@ -38,6 +38,7 @@
3939 import org.wikimedia.lsearch.config.IndexRegistry;
4040 import org.wikimedia.lsearch.interoperability.RMIMessengerClient;
4141 import org.wikimedia.lsearch.util.Command;
 42+import org.wikimedia.lsearch.util.FSUtils;
4243
4344 /**
4445 * Indexer.
@@ -235,20 +236,6 @@
236237 }
237238 }
238239
239 - protected static void deleteDirRecursive(File file){
240 - if(!file.exists())
241 - return;
242 - else if(file.isDirectory()){
243 - File[] files = file.listFiles();
244 - for(File f: files)
245 - deleteDirRecursive(f);
246 - file.delete();
247 - log.debug("Deleted old snapshot at "+file);
248 - } else{
249 - file.delete();
250 - }
251 - }
252 -
253240 /**
254241 * Make a snapshot of all changed indexes
255242 *
@@ -296,20 +283,27 @@
297284 File[] files = spd.listFiles();
298285 for(File f: files){
299286 if(!f.getAbsolutePath().equals(li.path)) // leave the last snapshot
300 - deleteDirRecursive(f);
 287+ FSUtils.deleteRecursive(f);
301288 }
302289 }
303290 new File(snapshot).mkdirs();
 291+ try {
 292+ FSUtils.createHardLinkRecursive(indexPath,snapshot);
 293+ } catch (IOException e) {
 294+ log.error("Error making snapshot "+snapshot+": "+e.getMessage());
 295+ return;
 296+ }
 297+ /*
304298 File ind =new File(indexPath);
305299 for(File f: ind.listFiles()){
306 - // use a cp -lr command for each file in the index
 300+ // hardlink the snapshot
307301 try {
308302 Command.exec("/bin/cp -lr "+indexPath+sep+f.getName()+" "+snapshot+sep+f.getName());
309303 } catch (IOException e) {
310304 log.error("Error making snapshot "+snapshot+": "+e.getMessage());
311305 continue;
312306 }
313 - }
 307+ } */
314308 log.info("Made snapshot "+snapshot);
315309 }
316310
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/HighlightDaemon.java
@@ -128,7 +128,7 @@
129129 FieldBuilder.Case dCase = exactCase? FieldBuilder.Case.EXACT_CASE : FieldBuilder.Case.IGNORE_CASE;
130130 String lang = global.getLanguage(dbname);
131131 Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid,exactCase);
132 - FieldBuilder.BuilderSet bs = new FieldBuilder(lang,dCase).getBuilder(dCase);
 132+ FieldBuilder.BuilderSet bs = new FieldBuilder(iid,dCase).getBuilder(dCase);
133133 WikiQueryParser parser = new WikiQueryParser(bs.getFields().contents(),
134134 new NamespaceFilter("0"),analyzer,bs,WikiQueryParser.NamespacePolicy.IGNORE,null);
135135 Query q = parser.parseFourPass(query,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname());
@@ -139,7 +139,7 @@
140140
141141 for(Article ar : articles){
142142 log.debug("Sending highlighted text for "+ar);
143 - String clean = new CleanupParser(ar.getContents(),lang).parse();
 143+ String clean = new CleanupParser(ar.getContents(),iid).parse();
144144 TokenStream tokens = analyzer.tokenStream("contents",clean);
145145 out.println("HIGHLIGHTING "+ar.getNamespace()+" "+ar.getTitle());
146146 String[] highlighted = highlighter.getBestFragments(tokens,clean,segments);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/CleanupParser.java
@@ -3,6 +3,7 @@
44 import java.util.HashSet;
55 import java.util.Hashtable;
66
 7+import org.wikimedia.lsearch.config.IndexId;
78 import org.wikimedia.lsearch.util.Localization;
89
910 /**
@@ -34,6 +35,7 @@
3536
3637 /** language code */
3738 private String language;
 39+ private IndexId iid;
3840 /** language code -> set (image namespace names) */
3941 private static Hashtable<String,HashSet<String>> imageLocalized = new Hashtable<String,HashSet<String>>();
4042 /** language code -> set (category namespace names) */
@@ -47,10 +49,11 @@
4850
4951 enum FetchState { WORD, CATEGORY, INTERWIKI, KEYWORD };
5052
51 - public CleanupParser(String text, String lang){
 53+ public CleanupParser(String text, IndexId iid){
5254 this.text = text.toCharArray();
5355 this.textString = text;
54 - this.language = lang;
 56+ this.iid = iid;
 57+ this.language = iid.getLangCode();
5558 textLength = text.length();
5659 out = new char[textLength];
5760 }
@@ -409,7 +412,7 @@
410413 else if(language!=null && language.length()!=0){
411414 HashSet<String> loc = imageLocalized.get(language);
412415 if(loc == null){
413 - loc = Localization.getLocalizedImage(language);
 416+ loc = Localization.getLocalizedImage(language,iid.getDBname());
414417 imageLocalized.put(language,loc);
415418 }
416419 if(loc.contains(prefix))
@@ -426,7 +429,7 @@
427430 else if(language!=null && language.length()!=0){
428431 HashSet<String> loc = categoryLocalized.get(language);
429432 if(loc == null){
430 - loc = Localization.getLocalizedCategory(language);
 433+ loc = Localization.getLocalizedCategory(language,iid.getDBname());
431434 categoryLocalized.put(language,loc);
432435 }
433436 if(loc.contains(prefix))
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java
@@ -83,7 +83,7 @@
8484 Iterator it = info.Namespaces.orderedEntries();
8585 while(it.hasNext()){
8686 Entry<Integer,String> pair = (Entry<Integer,String>)it.next();
87 - Localization.addCustomMapping(pair.getValue(),pair.getKey(),langCode);
 87+ Localization.addCustomMapping(pair.getValue(),pair.getKey(),iid.getDBname());
8888 }
8989 }
9090
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java
@@ -16,11 +16,17 @@
1717 import org.apache.lucene.queryParser.ParseException;
1818 import org.apache.lucene.search.BooleanClause;
1919 import org.apache.lucene.search.BooleanQuery;
 20+import org.apache.lucene.search.CustomBoostQuery;
 21+import org.apache.lucene.search.Explanation;
2022 import org.apache.lucene.search.PhraseQuery;
2123 import org.apache.lucene.search.Query;
2224 import org.apache.lucene.search.TermQuery;
2325 import org.apache.lucene.search.WildcardQuery;
2426 import org.apache.lucene.search.BooleanClause.Occur;
 27+import org.apache.lucene.search.function.CustomScoreQuery;
 28+import org.apache.lucene.search.function.FieldScoreQuery;
 29+import org.apache.lucene.search.function.ValueSource;
 30+import org.apache.lucene.search.function.ValueSourceQuery;
2531 import org.apache.lucene.search.spans.SpanNearQuery;
2632 import org.apache.lucene.search.spans.SpanQuery;
2733 import org.apache.lucene.search.spans.SpanTermQuery;
@@ -28,6 +34,8 @@
2935 import org.wikimedia.lsearch.config.GlobalConfiguration;
3036 import org.wikimedia.lsearch.index.WikiIndexModifier;
3137 import org.wikimedia.lsearch.search.NamespaceFilter;
 38+import org.wikimedia.lsearch.search.RankValueSource;
 39+import org.wikimedia.lsearch.search.RankValueSourceQuery;
3240 import org.wikimedia.lsearch.util.UnicodeDecomposer;
3341
3442 /**
@@ -88,12 +96,13 @@
8997 public static float KEYWORD_BOOST = 0.02f;
9098 public static float CONTENTS_BOOST = 0.2f;
9199
92 - public static int ADDITIONAL_PHRASE_SLOP_CONTENTS = 20;
 100+ public static int ADDITIONAL_PHRASE_SLOP_CONTENTS = 5000;
93101 public static float ADDITIONAL_BOOST_CONTENTS = 0.5f;
94 - public static int ADDITIONAL_PHRASE_SLOP_TITLE = 1;
 102+ public static int ADDITIONAL_PHRASE_SLOP_TITLE = 0;
95103 public static float ADDITIONAL_BOOST_TITLE = 0.5f;
96 - public static int ADDITIONAL_PHRASE_SLOP_RELATED = 10;
97 - public static float ADDITIONAL_BOOST_RELATED = 0.04f;
 104+ public static int ADDITIONAL_PHRASE_SLOP_RELATED = 0;
 105+ public static float ADDITIONAL_BOOST_RELATED = 0.2f;
 106+ public static float ADDITIONAL_BOOST_CONTEXT = 0.05f;
98107
99108 public static float WHOLE_TITLE_BOOST = 8f;
100109 public static float EXACT_CONTENTS_BOOST = 1f;
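Note: PhraseQuery slop is the maximum total move distance between term positions, so slop 0 demands the exact phrase, while the new slop of 5000 for contents effectively matches the words anywhere in the field and merely scores closer occurrences higher. An illustration with made-up terms:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.PhraseQuery;

public class SloppyPhraseSketch {
    static PhraseQuery contentsProximity() {
        PhraseQuery pq = new PhraseQuery();
        pq.add(new Term("contents", "douglas"));  // example words
        pq.add(new Term("contents", "adams"));
        pq.setSlop(5000);   // any distance matches; nearer occurrences score higher
        pq.setBoost(0.5f);  // ADDITIONAL_BOOST_CONTENTS
        return pq;
    }
}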
@@ -1422,11 +1431,30 @@
14231432 pq.setSlop(slop);
14241433 return pq;
14251434 }
1426 -
 1435+
14271436 /** Make phrase queries for additional scores */
14281437 public Query makePhraseQueries(ArrayList<String> words, String field, int slop, float boost){
14291438 if(words.size() <= 1)
14301439 return null;
 1440+ else{
 1441+ PhraseQuery pq = new PhraseQuery();
 1442+ for(String w : words){
 1443+ if(!stopWords.contains(w))
 1444+ pq.add(new Term(field,w));
 1445+ }
 1446+ pq.setSlop(slop);
 1447+ pq.setBoost(boost);
 1448+ return pq;
 1449+ }
 1450+
 1451+ }
 1452+
 1453+
 1454+ /** Make phrase queries for additional scores */
 1455+ @Deprecated
 1456+ public Query makePhraseQueriesOld(ArrayList<String> words, String field, int slop, float boost){
 1457+ if(words.size() <= 1)
 1458+ return null;
14311459 else if(words.size() == 2){
14321460 PhraseQuery pq = makePhrase(words,field,slop);
14331461 pq.setBoost(boost);
@@ -1550,26 +1578,73 @@
15511579 // skip last related group
15521580 Query[] pqr = new Query[RelatedAnalyzer.RELATED_GROUPS-1];
15531581 for(int i=1;i<RelatedAnalyzer.RELATED_GROUPS;i++){
1554 - pqr[i-1] = makePhraseQueries(words,"related"+i,ADDITIONAL_PHRASE_SLOP_RELATED,ADDITIONAL_BOOST_RELATED);
 1582+ pqr[i-1] = makePhraseQueries(words,fields.related()+i,ADDITIONAL_PHRASE_SLOP_RELATED,ADDITIONAL_BOOST_RELATED);
15551583 }
15561584 Query[] wqr = new Query[RelatedAnalyzer.RELATED_GROUPS-1];
15571585 for(int i=1;i<RelatedAnalyzer.RELATED_GROUPS;i++){
1558 - wqr[i-1] = makeWordQueries(words,"related"+i,ADDITIONAL_BOOST_RELATED / 4);
 1586+ wqr[i-1] = makeWordQueries(words,fields.related()+i,ADDITIONAL_BOOST_RELATED / 4);
15591587 }
 1588+ Query[] pqx = new Query[ContextAnalyzer.CONTEXT_GROUPS];
 1589+ // make context queries
 1590+ for(int i=1;i<=ContextAnalyzer.CONTEXT_GROUPS;i++){
 1591+ pqx[i-1] = makePhraseQueries(words,fields.context()+i,0,ADDITIONAL_BOOST_CONTEXT);
 1592+ }
15601593 if(wt==null && pqc == null && pqt == null && pqr[0] == null && wqr[0] == null)
15611594 return bq;
15621595 // build the final query
1563 - BooleanQuery finalQuery = new BooleanQuery(true);
 1596+ BooleanQuery coreQuery = new BooleanQuery(true);
15641597 BooleanQuery additional = new BooleanQuery(true);
 1598+ //BooleanQuery boostQuery = new BooleanQuery(true);
15651599
1566 - if(pqc != null)
1567 - additional.add(pqc,Occur.MUST);
 1600+ if(pqc != null){
 1601+ //additional.add(pqc,Occur.MUST);
 1602+ additional.add(new CustomScoreQuery(pqc, new RankValueSourceQuery(new RankValueSource())){
 1603+ public float customScore(int doc, float subQueryScore, float valSrcScore) {
 1604+ return (float) (subQueryScore * Math.log(Math.E+valSrcScore/15));
 1605+ }
 1606+ public Explanation customExplain(int doc, Explanation subQueryExpl, Explanation valSrcExpl) {
 1607+ float valSrcScore = valSrcExpl==null ? 1 : valSrcExpl.getValue();
 1608+ Explanation exp = new Explanation( (float)Math.log(Math.E+valSrcScore/15) * subQueryExpl.getValue(), ": "+valSrcScore+" "+(float)Math.log(Math.E+valSrcScore/15)+"*"+subQueryExpl.getValue()+" custom score: product of:");
 1609+ exp.addDetail(subQueryExpl);
 1610+ if (valSrcExpl != null) {
 1611+ exp.addDetail(valSrcExpl);
 1612+ }
 1613+ return exp;
 1614+ }
 1615+ },Occur.MUST);
 1616+ }
15681617 if(pqt != null)
15691618 additional.add(pqt,Occur.SHOULD);
15701619 if(wt != null)
15711620 additional.add(wt,Occur.SHOULD);
1572 - if(wc != null)
1573 - additional.add(wc,Occur.SHOULD);
 1621+ if(wc != null){
 1622+ // additional.add(wc,Occur.SHOULD);
 1623+ BooleanQuery boostExact = new BooleanQuery();
 1624+ for(Query q : pqr){
 1625+ if(q != null)
 1626+ boostExact.add(q,Occur.SHOULD);
 1627+ }
 1628+ for(Query q : wqr){
 1629+ if(q != null)
 1630+ boostExact.add(q,Occur.SHOULD);
 1631+ }
 1632+ CustomBoostQuery cbq = new CustomBoostQuery(wc,boostExact);
 1633+ /*CustomScoreQuery csq = new CustomScoreQuery(cbq, new RankValueSourceQuery(new RankValueSource())) {
 1634+ public float customScore(int doc, float subQueryScore, float valSrcScore) {
 1635+ return (float) (subQueryScore * Math.log10(10+valSrcScore));
 1636+ }
 1637+ public Explanation customExplain(int doc, Explanation subQueryExpl, Explanation valSrcExpl) {
 1638+ float valSrcScore = valSrcExpl==null ? 1 : valSrcExpl.getValue();
 1639+ Explanation exp = new Explanation( (float)Math.log10(10+valSrcScore) * subQueryExpl.getValue(), "custom score: product of:");
 1640+ exp.addDetail(subQueryExpl);
 1641+ if (valSrcExpl != null) {
 1642+ exp.addDetail(valSrcExpl);
 1643+ }
 1644+ return exp;
 1645+ }
 1646+ }; */
 1647+ additional.add(cbq,Occur.SHOULD);
 1648+ }
15741649 for(Query q : pqr){
15751650 if(q != null)
15761651 additional.add(q,Occur.SHOULD);
@@ -1578,16 +1653,21 @@
15791654 if(q != null)
15801655 additional.add(q,Occur.SHOULD);
15811656 }
 1657+ /*for(Query q : pqx){
 1658+ if(q != null)
 1659+ additional.add(q,Occur.SHOULD);
 1660+ } */
15821661
15831662 // anchors
15841663 //Query anchors = multiplySpans(nostem,0,fields.anchor(),ANCHOR_BOOST);
15851664
1586 - finalQuery.add(bq,Occur.MUST);
1587 - finalQuery.add(additional,Occur.SHOULD);
 1665+ coreQuery.add(bq,Occur.MUST);
 1666+ coreQuery.add(additional,Occur.SHOULD);
15881667 //if(anchors != null)
15891668 // finalQuery.add(anchors,Occur.SHOULD);
15901669
1591 - return finalQuery;
 1670+ return coreQuery;
 1671+ //return new CustomBoostQuery(coreQuery,boostQuery);
15921672
15931673 }
15941674
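For reference, the customScore above multiplies the sloppy-phrase score by log(e + rank/15), assuming RankValueSource yields the article's link rank: rank 0 gives a factor of ln(e) = 1 (no boost), rank 15 gives ln(e+1) ≈ 1.31, rank 150 gives ln(e+10) ≈ 2.54, and rank 1500 gives ln(e+100) ≈ 4.63, a deliberately gentle, logarithmic preference for heavily linked articles.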
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Analyzers.java
@@ -15,9 +15,11 @@
1616 import org.apache.lucene.analysis.ru.RussianStemFilter;
1717 import org.apache.lucene.analysis.th.ThaiWordFilter;
1818 import org.apache.lucene.search.FieldSortedHitQueue;
 19+import org.wikimedia.lsearch.beans.Title;
1920 import org.wikimedia.lsearch.config.GlobalConfiguration;
2021 import org.wikimedia.lsearch.config.IndexId;
2122 import org.wikimedia.lsearch.index.WikiIndexModifier;
 23+import org.wikimedia.lsearch.ranks.Links;
2224 import org.wikimedia.lsearch.related.RelatedTitle;
2325 import org.wikimedia.lsearch.test.AliasPorterStemFilter;
2426
@@ -54,12 +56,13 @@
5557 * @param languageAnalyzer language filter class (e.g. PorterStemFilter)
5658 * @return {PerFieldAnalyzerWrapper,WikiTokenizer}
5759 */
58 - public static Object[] getIndexerAnalyzer(String text, FieldBuilder builder, ArrayList<String> redirects, ArrayList<String> anchors, ArrayList<RelatedTitle> related, int[] relatedPartition) {
 60+ public static Object[] getIndexerAnalyzer(String text, FieldBuilder builder, ArrayList<String> redirects, ArrayList<String> anchors,
 61+ ArrayList<RelatedTitle> related, int[] relatedPartition, Title title, Links links) {
5962 PerFieldAnalyzerWrapper perFieldAnalyzer = new PerFieldAnalyzerWrapper(new SimpleAnalyzer());
6063 WikiTokenizer tokenizer = null;
6164 for(FieldBuilder.BuilderSet bs : builder.getBuilders()){
6265 tokenizer = addFieldsForIndexing(perFieldAnalyzer, text, bs.getFilters(), bs.getFields(),
63 - redirects, anchors, related, relatedPartition, bs.isExactCase(), bs.isAddKeywords());
 66+ redirects, anchors, related, relatedPartition, title, links, bs.isExactCase(), bs.isAddKeywords());
6467 }
6568 return new Object[] {perFieldAnalyzer,tokenizer};
6669 }
@@ -70,9 +73,9 @@
7174 */
7275 public static WikiTokenizer addFieldsForIndexing(PerFieldAnalyzerWrapper perFieldAnalyzer, String text,
7376 FilterFactory filters, FieldNameFactory fields, ArrayList<String> redirects, ArrayList<String> anchors,
74 - ArrayList<RelatedTitle> related, int[] relatedPartition, boolean exactCase, boolean addKeywords) {
 77+ ArrayList<RelatedTitle> related, int[] relatedPartition, Title title, Links links, boolean exactCase, boolean addKeywords) {
7578 // parse wiki-text to get categories
76 - WikiTokenizer tokenizer = new WikiTokenizer(text,filters.getLanguage(),exactCase);
 79+ WikiTokenizer tokenizer = new WikiTokenizer(text,filters.getIndexId(),exactCase);
7780 tokenizer.tokenize();
7881 ArrayList<String> categories = tokenizer.getCategories();
7982 HashMap<String,String> interwiki = tokenizer.getInterwikis();
@@ -106,6 +109,9 @@
107110 // related
108111 setRelatedAnalyzer(perFieldAnalyzer,fields.related(),
109112 new RelatedAnalyzer(related,relatedPartition,filters.getNoStemmerFilterFactory(),fields.related(),exactCase));
 113+ // context
 114+ setContextAnalyzer(perFieldAnalyzer,fields.context(),
 115+ new ContextAnalyzer(title,links,related,relatedPartition,filters.getNoStemmerFilterFactory(),fields.context(),exactCase));
110116 return tokenizer;
111117 }
112118
@@ -126,24 +132,24 @@
127133 perFieldAnalyzer.addAnalyzer(prefix+i,analyzer);
128134 }
129135 }
130 -
131 - public static PerFieldAnalyzerWrapper getSearcherAnalyzer(IndexId iid, boolean exactCase){
132 - if(global == null)
133 - global = GlobalConfiguration.getInstance();
134 - return getSearcherAnalyzer(global.getLanguage(iid.getDBname()),exactCase);
135 -
 136+
 137+ protected static void setContextAnalyzer(PerFieldAnalyzerWrapper perFieldAnalyzer, String prefix, ContextAnalyzer analyzer) {
 138+ for(int i=1;i<=ContextAnalyzer.CONTEXT_GROUPS;i++){
 139+ perFieldAnalyzer.addAnalyzer(prefix+i,analyzer);
 140+ }
136141 }
137142
138 - public static PerFieldAnalyzerWrapper getSearcherAnalyzer(String langCode){
139 - return getSearcherAnalyzer(langCode,false);
 143+
 144+ public static PerFieldAnalyzerWrapper getSearcherAnalyzer(IndexId iid){
 145+ return getSearcherAnalyzer(iid,false);
140146 }
141147
142 - public static PerFieldAnalyzerWrapper getSearcherAnalyzer(String langCode, boolean exactCase){
143 - return getSearcherAnalyzer(new FilterFactory(langCode),new FieldNameFactory(exactCase));
 148+ public static PerFieldAnalyzerWrapper getSearcherAnalyzer(IndexId iid, boolean exactCase){
 149+ return getSearcherAnalyzer(new FilterFactory(iid),new FieldNameFactory(exactCase));
144150 }
145151
146 - public static PerFieldAnalyzerWrapper getSpellCheckAnalyzer(String langCode, HashSet<String> stopWords){
147 - FilterFactory filters = new FilterFactory(langCode,FilterFactory.Type.SPELL_CHECK);
 152+ public static PerFieldAnalyzerWrapper getSpellCheckAnalyzer(IndexId iid, HashSet<String> stopWords){
 153+ FilterFactory filters = new FilterFactory(iid,FilterFactory.Type.SPELL_CHECK);
148154 filters.setStopWords(stopWords);
149155 return getSearcherAnalyzer(filters,new FieldNameFactory());
150156 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FieldNameFactory.java
@@ -60,6 +60,13 @@
6161 return "related";
6262 }
6363
 64+ public String context(){
 65+ if(exactCase)
 66+ return "context_exact";
 67+ else
 68+ return "context";
 69+ }
 70+
6471 public String anchor(){
6572 if(exactCase)
6673 return "anchor_exact";
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java
@@ -9,6 +9,7 @@
1010 import org.apache.lucene.analysis.Analyzer;
1111 import org.apache.lucene.analysis.Token;
1212 import org.apache.lucene.analysis.TokenStream;
 13+import org.wikimedia.lsearch.config.IndexId;
1314
1415 /**
1516 * Analyzer that builds a field with an array of keywords,
@@ -28,6 +29,7 @@
2930 static Logger log = Logger.getLogger(KeywordsAnalyzer.class);
3031 protected KeywordsTokenStream[] tokensBySize = null;
3132 protected String prefix;
 33+ protected IndexId iid;
3234
3335	 /** number of fields to be generated, e.g. keyword1 for single-word keywords,
3436	 * keyword2 for two-word keywords, etc ... the last field has all the remaining keywords
@@ -50,6 +52,7 @@
5153
5254 protected void init(ArrayList<String> keywords, FilterFactory filters, String prefix, boolean exactCase) {
5355 this.prefix = prefix;
 56+ this.iid = filters.getIndexId();
5457 tokensBySize = new KeywordsTokenStream[KEYWORD_LEVELS];
5558 if(keywords == null){
5659 // init empty token streams
@@ -63,7 +66,7 @@
6467 keywordsBySize.add(new ArrayList<String>());
6568	 // arrange keywords into a list by token number
6669 for(String k : keywords){
67 - ArrayList<Token> parsed = new FastWikiTokenizerEngine(k,filters.getLanguage(),exactCase).parse();
 70+ ArrayList<Token> parsed = new FastWikiTokenizerEngine(k,iid,exactCase).parse();
6871 if(parsed.size() == 0)
6972 continue;
7073 else if(parsed.size() < KEYWORD_LEVELS)
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FilterFactory.java
@@ -11,6 +11,7 @@
1212 import org.apache.lucene.analysis.de.GermanStemFilter;
1313 import org.apache.lucene.analysis.snowball.SnowballFilter;
1414 import org.apache.lucene.analysis.th.ThaiWordFilter;
 15+import org.wikimedia.lsearch.config.IndexId;
1516
1617 /**
1718 * Make a language-dependent pair of filters. The custom filter is to be applied before the stemmer.
@@ -20,6 +21,7 @@
2122 */
2223 public class FilterFactory {
2324 protected String lang;
 25+ protected IndexId iid;
2426 protected String snowballName = null;
2527 protected boolean useStemmer,useLangFilter;
2628 protected Class stemmer = null;
@@ -33,18 +35,20 @@
3436 public enum Type { FULL, NO_STEM, SPELL_CHECK };
3537 protected Type type = null;
3638
37 - public FilterFactory(String lang){
38 - this(lang,Type.FULL);
 39+ public FilterFactory(IndexId iid){
 40+ this(iid,Type.FULL);
3941 }
4042
41 - public FilterFactory(String lang, Type type){
42 - this.lang = lang;
 43+ public FilterFactory(IndexId iid, Type type){
 44+ this.lang = iid.getLangCode();
 45+ this.iid = iid;
4346 this.type = type;
4447 init();
45 - noStemmerFilterFactory = new FilterFactory(lang,snowballName,false,useLangFilter,null,langFilter,additionalFilters);
 48+ noStemmerFilterFactory = new FilterFactory(iid,lang,snowballName,false,useLangFilter,null,langFilter,additionalFilters);
4649 }
4750
48 - public FilterFactory(String lang, String snowballName, boolean useStemmer, boolean useLangFilter, Class stemmer, Class langFilter, ArrayList<Class> additionalFilters) {
 51+ public FilterFactory(IndexId iid, String lang, String snowballName, boolean useStemmer, boolean useLangFilter, Class stemmer, Class langFilter, ArrayList<Class> additionalFilters) {
 52+ this.iid = iid;
4953 this.lang = lang;
5054 this.snowballName = snowballName;
5155 this.useStemmer = useStemmer;
@@ -193,6 +197,12 @@
194198 public void setStopWords(Set<String> stopWords){
195199 this.stopWords = stopWords;
196200 }
 201+
 202+ public IndexId getIndexId() {
 203+ return iid;
 204+ }
197205
198206
 207+
 208+
199209 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/QueryLanguageAnalyzer.java
@@ -25,7 +25,7 @@
2626 */
2727 @Override
2828 public TokenStream tokenStream(String fieldName, String text) {
29 - wikitokenizer = new WikiTokenizer(text,filters.getLanguage(),exactCase);
 29+ wikitokenizer = new WikiTokenizer(text,filters.getIndexId(),exactCase);
3030 return super.tokenStream(fieldName,(Reader)null);
3131 }
3232
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/RelatedAnalyzer.java
@@ -21,6 +21,7 @@
2222
2323 public RelatedAnalyzer(ArrayList<RelatedTitle> related, int[] p, FilterFactory filters, String prefix, boolean exactCase) {
2424 this.prefix = prefix;
 25+ this.iid = filters.getIndexId();
2526 tokensBySize = new KeywordsTokenStream[RELATED_GROUPS];
2627 if(related == null || p == null){
2728 // init empty token streams
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/ContextAnalyzer.java
@@ -0,0 +1,60 @@
 2+package org.wikimedia.lsearch.analyzers;
 3+
 4+import java.io.IOException;
 5+import java.util.ArrayList;
 6+import java.util.Collection;
 7+
 8+import org.wikimedia.lsearch.analyzers.KeywordsAnalyzer.KeywordsTokenStream;
 9+import org.wikimedia.lsearch.beans.Title;
 10+import org.wikimedia.lsearch.ranks.Links;
 11+import org.wikimedia.lsearch.related.RelatedTitle;
 12+
 13+/**
 14+ * Link contexts, tokenized with token gaps between individual contexts
 15+ *
 16+ * @author rainman
 17+ *
 18+ */
 19+public class ContextAnalyzer extends KeywordsAnalyzer {
 20+ static public int CONTEXT_GROUPS = 2;
 21+
 22+ static public int TOKEN_GAP = 100;
 23+
 24+ public ContextAnalyzer(Title title, Links links, ArrayList<RelatedTitle> related, int[] p, FilterFactory filters, String prefix, boolean exactCase) {
 25+ this.prefix = prefix;
 26+ this.iid = filters.getIndexId();
 27+ tokensBySize = new KeywordsTokenStream[CONTEXT_GROUPS];
 28+ if(related == null || p == null || title == null || links == null){
 29+ // init empty token streams
 30+ for(int i=0; i< CONTEXT_GROUPS; i++){
 31+ tokensBySize[i] = new KeywordsTokenStream(null,filters,exactCase,TOKEN_GAP);
 32+ }
 33+ return;
 34+ }
 35+ String key = title.getKey();
 36+ // split up the related titles into CONTEXT_GROUPS partitions
 37+ ArrayList<ArrayList<String>> partitions = new ArrayList<ArrayList<String>>();
 38+ for(int i=0;i<CONTEXT_GROUPS;i++){
 39+ ArrayList<String> part = new ArrayList<String>();
 40+ for(int j=p[i];j<p[i+1];j++){
 41+ Title t = related.get(j).getRelated();
 42+ Collection<String> contexts;
 43+ try {
 44+ contexts = links.getContext(t.getKey(),key);
 45+ //System.out.println("CONTEXT "+t.getKey()+" -> "+key+" : "+contexts);
 46+ if(contexts != null)
 47+ part.addAll(contexts);
 48+ } catch (IOException e) {
 49+ log.warn("Cannot fetch context for "+key+" from "+t.getKey()+" : "+e.getMessage());
 50+ e.printStackTrace();
 51+ }
 52+
 53+ }
 54+ partitions.add(part);
 55+ }
 56+ for(int i=0; i< CONTEXT_GROUPS; i++){
 57+ tokensBySize[i] = new KeywordsTokenStream(partitions.get(i),filters,exactCase,TOKEN_GAP);
 58+ }
 59+ }
 60+
 61+}
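Note: TOKEN_GAP leaves 100 empty positions between consecutive contexts in the same field, so a phrase or sloppy-phrase query cannot match across two unrelated contexts. A hypothetical sketch of the idea (not the actual KeywordsTokenStream):

import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

/** Emits each context's words at consecutive positions, then jumps ahead
 *  TOKEN_GAP positions before the next context starts. */
class GappedContextStream extends TokenStream {
    static final int TOKEN_GAP = 100;
    private final Iterator<List<String>> contexts;
    private Iterator<String> words = null;
    private boolean contextStart = false;
    private int offset = 0;

    GappedContextStream(List<List<String>> tokenizedContexts) {
        this.contexts = tokenizedContexts.iterator();
    }

    public Token next() throws IOException {
        while (words == null || !words.hasNext()) {
            if (!contexts.hasNext())
                return null; // end of stream
            words = contexts.next().iterator();
            contextStart = true;
        }
        String w = words.next();
        Token t = new Token(w, offset, offset + w.length());
        offset += w.length() + 1;
        if (contextStart) {
            t.setPositionIncrement(TOKEN_GAP); // open the gap between contexts
            contextStart = false;
        }
        return t;
    }
}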
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FieldBuilder.java
@@ -1,5 +1,7 @@
22 package org.wikimedia.lsearch.analyzers;
33
 4+import org.wikimedia.lsearch.config.IndexId;
 5+
46 /**
57	 * Aggregate class for FilterFactory and FieldNameFactory. This class
68 * contains methods used to build various fields of the index,
@@ -47,15 +49,15 @@
4850 public static enum Options { NONE, SPELL_CHECK };
4951
5052 /** Construct case-insensitive field builder with stemming */
51 - public FieldBuilder(String lang){
52 - this(lang,Case.IGNORE_CASE,Stemmer.USE_STEMMER,Options.NONE);
 53+ public FieldBuilder(IndexId iid){
 54+ this(iid,Case.IGNORE_CASE,Stemmer.USE_STEMMER,Options.NONE);
5355 }
5456
55 - public FieldBuilder(String lang, Case useCase){
56 - this(lang,useCase,Stemmer.USE_STEMMER,Options.NONE);
 57+ public FieldBuilder(IndexId iid, Case useCase){
 58+ this(iid,useCase,Stemmer.USE_STEMMER,Options.NONE);
5759 }
5860
59 - public FieldBuilder(String lang, Case useCase, Stemmer useStemmer, Options options){
 61+ public FieldBuilder(IndexId iid, Case useCase, Stemmer useStemmer, Options options){
6062 FilterFactory.Type type = FilterFactory.Type.FULL;
6163 if(options == Options.SPELL_CHECK)
6264 type = FilterFactory.Type.SPELL_CHECK;
@@ -63,7 +65,7 @@
6466 if(useCase == Case.EXACT_CASE){
6567 builders = new BuilderSet[2];
6668 builders[1] = new BuilderSet(
67 - new FilterFactory(lang,type).getNoStemmerFilterFactory(),
 69+ new FilterFactory(iid,type).getNoStemmerFilterFactory(),
6870 new FieldNameFactory(FieldNameFactory.EXACT_CASE));
6971 } else
7072 builders = new BuilderSet[1];
@@ -71,11 +73,11 @@
7274 // default factory, lowercase all data
7375 if(useStemmer == Stemmer.USE_STEMMER){
7476 builders[0] = new BuilderSet(
75 - new FilterFactory(lang,type),
 77+ new FilterFactory(iid,type),
7678 new FieldNameFactory());
7779 } else{
7880 builders[0] = new BuilderSet(
79 - new FilterFactory(lang,type).getNoStemmerFilterFactory(),
 81+ new FilterFactory(iid,type).getNoStemmerFilterFactory(),
8082 new FieldNameFactory());
8183 }
8284
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiTokenizer.java
@@ -10,6 +10,7 @@
1111 import org.apache.log4j.Logger;
1212 import org.apache.lucene.analysis.Token;
1313 import org.apache.lucene.analysis.Tokenizer;
 14+import org.wikimedia.lsearch.config.IndexId;
1415
1516 /** Uses FastWikiTokenizerEngine to tokenize text */
1617 public class WikiTokenizer extends Tokenizer {
@@ -36,8 +37,8 @@
3738 * @param str
3839 */
3940
40 - public WikiTokenizer(String str, String lang, boolean exactCase){
41 - parser = new FastWikiTokenizerEngine(str,lang,exactCase);
 41+ public WikiTokenizer(String str, IndexId iid, boolean exactCase){
 42+ parser = new FastWikiTokenizerEngine(str,iid,exactCase);
4243 this.input = null;
4344 }
4445
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/SplitAnalyzer.java
@@ -10,7 +10,7 @@
1111 import org.apache.lucene.analysis.Tokenizer;
1212 import org.wikimedia.lsearch.ranks.StringList;
1313
14 -/** Split the text by some specific char */
 14+/** Analyzes serialized StringLists into their components */
1515 public class SplitAnalyzer extends Analyzer {
1616 class SplitTokenStream extends Tokenizer {
1717 Iterator<String> it = null;
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java
@@ -9,6 +9,7 @@
1010
1111 import org.apache.commons.lang.WordUtils;
1212 import org.apache.lucene.analysis.Token;
 13+import org.wikimedia.lsearch.config.IndexId;
1314 import org.wikimedia.lsearch.util.Localization;
1415 import org.wikimedia.lsearch.util.UnicodeDecomposer;
1516
@@ -67,6 +68,7 @@
6869
6970 /** language code */
7071 private String language;
 72+ private IndexId iid;
7173 /** language code -> set (image namespace names) */
7274 private static Hashtable<String,HashSet<String>> imageLocalized = new Hashtable<String,HashSet<String>>();
7375 /** language code -> set (category namespace names) */
@@ -111,10 +113,11 @@
112114 }
113115 }
114116
115 - public FastWikiTokenizerEngine(String text, String lang, boolean exactCase){
 117+ public FastWikiTokenizerEngine(String text, IndexId iid, boolean exactCase){
116118 this.text = text.toCharArray();
117119 this.textString = text;
118 - this.language = lang;
 120+ this.language = iid.getLangCode();
 121+ this.iid = iid;
119122 this.exactCase = exactCase;
120123 textLength = text.length();
121124 init();
@@ -744,7 +747,7 @@
745748 else if(language!=null && language.length()!=0){
746749 HashSet<String> loc = imageLocalized.get(language);
747750 if(loc == null){
748 - loc = Localization.getLocalizedImage(language);
 751+ loc = Localization.getLocalizedImage(language,iid.getDBname());
749752 imageLocalized.put(language,loc);
750753 }
751754 if(loc.contains(prefix))
@@ -761,7 +764,7 @@
762765 else if(language!=null && language.length()!=0){
763766 HashSet<String> loc = categoryLocalized.get(language);
764767 if(loc == null){
765 - loc = Localization.getLocalizedCategory(language);
 768+ loc = Localization.getLocalizedCategory(language,iid.getDBname());
766769 categoryLocalized.put(language,loc);
767770 }
768771 if(loc.contains(prefix))
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/Localization.java
@@ -8,6 +8,7 @@
99 import java.util.HashMap;
1010 import java.util.Hashtable;
1111 import java.util.HashSet;
 12+import java.util.Map;
1213 import java.util.Map.Entry;
1314
1415 import org.apache.log4j.Logger;
@@ -28,9 +29,16 @@
2930 protected static Object lock = new Object();
3031 /** Languages for which loading of localization failed */
3132 protected static HashSet<String> badLocalizations = new HashSet<String>();
 33+	/** Languages for which we loaded localization */
 34+ protected static HashSet<String> loadedLocalizations = new HashSet<String>();
3235 protected static HashSet<String> interwiki = null;
3336	/** lowercased canonical names of namespaces */
34 - protected static Hashtable<String,Integer> canonicalNamespaces = new Hashtable<String,Integer>();
 37+ protected static Hashtable<String,Integer> canonicalNamespaces = new Hashtable<String,Integer>();
 38+ /** dbname -> meta namespace name */
 39+ protected static Hashtable<String,String> metaNamespaces = new Hashtable<String,String>();
 40+	/** custom mappings (from OAI headers, etc.) dbname -> nsname -> nsindex */
 41+ protected static Hashtable<String,Hashtable<String,Integer>> customNamespaces = new Hashtable<String,Hashtable<String,Integer>>();
 42+
3543 static{
3644 canonicalNamespaces.put("media",-2);
3745 canonicalNamespaces.put("special",-1);
@@ -51,48 +59,72 @@
5260 canonicalNamespaces.put("category_talk",15);
5361 }
5462
 63+ /** set meta namespaces for specific db names */
 64+ public static void setMetaNamespace(Map<String,String> dbmeta){
 65+ synchronized(lock){
 66+ metaNamespaces.putAll(dbmeta);
 67+ }
 68+ }
 69+
5570 /** Add custom mapping not found in localization files from other source, e.g. project name, etc.. */
56 - public static void addCustomMapping(String namespace, int index, String langCode){
 71+ public static void addCustomMapping(String namespace, int index, String dbname){
5772 synchronized(lock){
58 - Hashtable<String,Integer> map = namespaces.get(langCode);
 73+ Hashtable<String,Integer> map = customNamespaces.get(dbname);
5974 if(map == null){
6075 map = new Hashtable<String,Integer>();
61 - namespaces.put(langCode,map);
 76+ customNamespaces.put(dbname,map);
6277 }
6378 map.put(namespace.toLowerCase(),index);
6479 }
6580 }
66 -
67 - public static HashSet<String> getLocalizedImage(String langCode){
68 - return getLocalizedNamespace(langCode,6);
 81+ /** Get a new hashset of localized image namespace names */
 82+ public static HashSet<String> getLocalizedImage(String langCode, String dbname){
 83+ return getLocalizedNamespace(langCode,6,dbname);
6984 }
70 -
71 - public static HashSet<String> getLocalizedCategory(String langCode){
72 - return getLocalizedNamespace(langCode,14);
 85+ /** Get a new hashset of localized category namespace names */
 86+ public static HashSet<String> getLocalizedCategory(String langCode, String dbname){
 87+ return getLocalizedNamespace(langCode,14,dbname);
7388 }
7489
75 - public static HashSet<String> getLocalizedNamespace(String langCode, int nsId){
 90+ public static HashSet<String> getLocalizedNamespace(String langCode, int nsId, String dbname){
7691 synchronized (lock){
 92+ HashSet<String> res = new HashSet<String>();
7793 langCode = langCode.toLowerCase();
78 - if(namespaces.get(langCode)==null){
79 - if(badLocalizations.contains(langCode) || !readLocalization(langCode))
80 - return new HashSet<String>();
 94+ if(namespaces.get(langCode)==null)
 95+ readLocalization(langCode);
 96+
 97+ // get namespaces from message files
 98+ res.addAll(collect(namespaces.get(langCode),nsId));
 99+ // get db-specific names, like meta namespaces or ones obtained via oai or other ways
 100+ if(dbname != null){
 101+ res.addAll(collect(customNamespaces.get(dbname),nsId));
 102+ if(nsId == 4 && metaNamespaces.containsKey(dbname))
 103+ res.add(metaNamespaces.get(dbname));
81104 }
82 - return collect(namespaces.get(langCode),nsId);
 105+ return res;
83106 }
84107 }
85108
86109 /** Get mapping namespace_name (lowercase) -> namespace_index */
87 - public static HashMap<String,Integer> getLocalizedNamespaces(String langCode){
 110+ public static HashMap<String,Integer> getLocalizedNamespaces(String langCode, String dbname){
88111 synchronized (lock){
89112 HashMap<String,Integer> ret = new HashMap<String,Integer>();
90113 ret.putAll(canonicalNamespaces);
91114 langCode = langCode.toLowerCase();
92 - if(namespaces.get(langCode)==null){
93 - if(badLocalizations.contains(langCode) || !readLocalization(langCode))
94 - return ret;
 115+ if(namespaces.get(langCode)==null)
 116+ readLocalization(langCode);
 117+ // localization from messages files
 118+ if(namespaces.containsKey(langCode))
 119+ ret.putAll(namespaces.get(langCode));
 120+ // db-specific
 121+ if(dbname != null){
 122+ // meta namespaces
 123+ if(metaNamespaces.containsKey(dbname))
 124+ ret.put(metaNamespaces.get(dbname),4);
 125+ // custom
 126+ if(customNamespaces.containsKey(dbname))
 127+ ret.putAll(customNamespaces.get(dbname));
95128 }
96 - ret.putAll(namespaces.get(langCode));
97129 return ret;
98130 }
99131 }
@@ -107,6 +139,8 @@
108140 /** Collect all the names with some certain namespace id */
109141 protected static HashSet<String> collect(Hashtable<String,Integer> ns, int nsid) {
110142 HashSet<String> ret = new HashSet<String>();
 143+ if(ns == null)
 144+ return ret;
111145 for(Entry<String,Integer> e : ns.entrySet()){
112146 if(e.getValue().intValue() == nsid)
113147 ret.add(e.getKey());
@@ -123,6 +157,10 @@
124158 /** Level is recursion level (to detect infinite recursion if language
125159 * defines itself as a fallback) */
126160 protected static boolean readLocalization(String langCode, int level){
 161+ if(badLocalizations.contains(langCode))
 162+ return false; // failed previously
 163+ if(loadedLocalizations.contains(langCode))
 164+ return true; // already loaded
127165 Configuration config = Configuration.open();
128166 if(langCode == null || langCode.equals(""))
129167 return false;
@@ -158,6 +196,7 @@
159197 if(ns!=null && ns.size()!=0){
160198 namespaces.put(langCode.toLowerCase(),ns);
161199	 log.debug("Successfully loaded localization for "+langCode.toLowerCase());
 200+ loadedLocalizations.add(langCode);
162201 return true;
163202	 } else{ // maybe a fallback language is defined instead
164203 String fallback = parser.getFallBack(text);
@@ -165,6 +204,7 @@
166205 fallback = fallback.replace('-','_');
167206 boolean succ = readLocalization(fallback,level+1);
168207 if(succ){
 208+ loadedLocalizations.add(fallback);
169209 namespaces.put(langCode.toLowerCase(),namespaces.get(fallback.toLowerCase()));
170210 redirects.put(langCode.toLowerCase(),redirects.get(fallback.toLowerCase()));
171211 }
@@ -216,9 +256,13 @@
217257 int end = line.indexOf("]]");
218258 if(begin != -1 && end != -1 && end > begin){
219259 String redirectText = text.substring(begin+2,end);
 260+ int pipe = redirectText.indexOf('|');
 261+ if(pipe != -1)
 262+ redirectText = redirectText.substring(0,pipe);
220263 int fragment = redirectText.lastIndexOf('#');
221264 if(fragment != -1)
222265 redirectText = redirectText.substring(0,fragment);
 266+ redirectText = redirectText.replace('_',' ');
223267 return redirectText;
224268 }
225269 }
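For example, with the two normalizations added above, a redirect line containing [[Foo_bar#History|label]] now resolves to the target "Foo bar": the pipe and label are dropped first, then the #History fragment, and finally underscores become spaces (the titles here are made up).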
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/FSUtils.java
@@ -0,0 +1,139 @@
 2+package org.wikimedia.lsearch.util;
 3+
 4+import java.io.File;
 5+import java.io.IOException;
 6+
 7+/**
 8+ * Various abstractions of file system operations: delete dirs,
 9+ * make soft/hard links ...
 10+ *
 11+ * Based on FileUtil.java from Lucene Hadoop project (Apache Licence)
 12+ * @author rainman
 13+ *
 14+ */
 15+public class FSUtils {
 16+ public static final String PATH_SEP = System.getProperty("file.separator");
 17+
 18+ enum OSType { OS_TYPE_UNIX, OS_TYPE_WINXP };
 19+
 20+ protected static String[] hardLinkCommand;
 21+
 22+ static {
 23+ switch(getOSType()) {
 24+ case OS_TYPE_WINXP:
 25+ hardLinkCommand = new String[] {"fsutil","hardlink","create", null, null};
 26+ break;
 27+ case OS_TYPE_UNIX:
 28+ default:
 29+ hardLinkCommand = new String[] {"ln", null, null};
 30+ }
 31+ }
 32+
 33+ static OSType getOSType() {
 34+ String osName = System.getProperty("os.name");
 35+ if (osName.indexOf("Windows") >= 0 &&
 36+ (osName.indexOf("XP") >= 0 || osName.indexOf("2003") >= 0))
 37+ return OSType.OS_TYPE_WINXP;
 38+ else
 39+ return OSType.OS_TYPE_UNIX;
 40+ }
 41+
 42+ /**
 43+ * Create a hardlink in the filesystem.
 44+ *
 45+ * @param target
 46+ * @param linkName
 47+ * @throws IOException
 48+ */
 49+ public static void createHardLink(File target, File linkName) throws IOException {
 50+ int len = hardLinkCommand.length;
 51+ hardLinkCommand[len-2] = target.getCanonicalPath();
 52+ hardLinkCommand[len-1] = linkName.getCanonicalPath();
 53+ Command.exec(hardLinkCommand);
 54+ }
 55+
 56+ /**
 57+ * Create hard links recursively if the target is a directory
 58+ *
 59+ * @param target
 60+ * @param linkname
 61+ * @throws IOException
 62+ */
 63+ public static void createHardLinkRecursive(String target, String linkname) throws IOException {
 64+ File file = new File(target);
 65+ if(!file.exists())
 66+		throw new IOException("Trying to hardlink nonexistent file "+target);
 67+ if(file.isDirectory()){
 68+ File[] files = file.listFiles();
 69+ for(File f: files)
 70+ createHardLinkRecursive(format(new String[]{target,f.getName()}),format(new String[] {linkname,f.getName()}));
 71+ } else
 72+ createHardLink(new File(target),new File(linkname));
 73+ }
 74+
 75+
 76+ /**
 77+ * Create a soft link between a src and destination
 78+ * only on a local disk. HDFS does not support this
 79+ * @param target the target for symlink
 80+ * @param linkname the symlink
 81+ */
 82+ public static void createSymLink(String target, String linkname) throws IOException{
 83+ String cmd = "ln -s " + target + " " + linkname;
 84+ Command.exec(cmd);
 85+ }
 86+
 87+ /**
 88+	 * Append path parts via the system's path separator.
 89+	 * I.e. {"/usr/local", "search"} -> "/usr/local/search/" (note the trailing separator)
 90+ * @param parts
 91+ */
 92+ public static String format(String[] parts){
 93+ StringBuilder sb = new StringBuilder();
 94+ boolean first = true;
 95+ for(String p : parts){
 96+ if(!first && p.startsWith(PATH_SEP))
 97+ p = p.substring(PATH_SEP.length());
 98+ sb.append(p);
 99+ if(!p.endsWith(PATH_SEP))
 100+ sb.append(PATH_SEP);
 101+ if(first)
 102+ first = false;
 103+ }
 104+ return sb.toString();
 105+ }
 106+
 107+ /**
 108+ * Construct a file from parts of path
 109+ * @param parts
 110+ */
 111+ public static File formatFile(String[] parts){
 112+ return new File(format(parts));
 113+ }
 114+
 115+ /**
 116+ * Delete a file recursively
 117+ *
 118+ * @param file
 119+ */
 120+ public static void deleteRecursive(File file){
 121+ if(!file.exists())
 122+ return;
 123+ else if(file.isDirectory()){
 124+ File[] files = file.listFiles();
 125+ for(File f: files)
 126+ deleteRecursive(f);
 127+ file.delete();
 128+ } else{
 129+ file.delete();
 130+ }
 131+ }
 132+
 133+ /** Delete single file */
 134+ public static void delete(String path) {
 135+ File f = new File(path);
 136+ if(f.exists()) // if doesn't exist don't complain
 137+ f.delete();
 138+ }
 139+
 140+}
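Note: since Lucene segment files are immutable once written, recursive hardlinks give a cheap point-in-time copy of an index, which is how the snapshot code above now uses this class. A hedged sketch with made-up paths:

import java.io.File;
import java.io.IOException;
import org.wikimedia.lsearch.util.FSUtils;

public class SnapshotSketch {
    public static void main(String[] args) throws IOException {
        String index = "/usr/local/search/indexes/enwiki";    // hypothetical
        String snap  = "/usr/local/search/snapshots/enwiki";  // hypothetical
        FSUtils.deleteRecursive(new File(snap)); // drop any stale snapshot
        new File(snap).mkdirs();
        // hardlink each index file into the snapshot directory
        FSUtils.createHardLinkRecursive(index, snap);
    }
}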
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/Command.java
@@ -24,10 +24,17 @@
2525 }
2626
2727 public static void exec(String command) throws IOException {
 28+ exec(new String[] {command});
 29+ }
 30+
 31+ public static void exec(String[] command) throws IOException {
2832 Process p = null;
2933 log.debug("Executing shell command "+command);
3034 try {
31 - p = Runtime.getRuntime().exec(command);
 35+ if(command.length == 1)
 36+ p = Runtime.getRuntime().exec(command[0]);
 37+ else
 38+ p = Runtime.getRuntime().exec(command);
3239 p.waitFor();
3340 if(p.exitValue()!=0){
3441 log.warn("Got exit value "+p.exitValue()+" while executing "+command);
@@ -43,6 +50,8 @@
4451 throw new IOException("Interrupted");
4552 } finally {
4653 closeStreams(p);
 54+ if(p != null)
 55+ p.destroy();
4756 }
4857 }
4958
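Note: Runtime.exec(String) tokenizes its argument on whitespace, so the new String[] overload exists to pass arguments through verbatim, as the ln/fsutil argument arrays in FSUtils require. A hedged usage sketch with made-up paths:

import java.io.IOException;
import org.wikimedia.lsearch.util.Command;

public class CommandSketch {
    public static void main(String[] args) throws IOException {
        // array form: "My Index" survives as a single argument
        Command.exec(new String[]{ "ln", "/data/My Index/segments", "/data/snap/segments" });
        // single-string form remains for simple commands
        Command.exec("ls -l /data");
    }
}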
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/PHPParser.java
@@ -162,6 +162,24 @@
163163 return servers;
164164 }
165165
 166+ /** Get wgMetaNamespace (dbname->metans name) from InitialiseSettings */
 167+ public Hashtable<String,String> getMetaNamespace(String text){
 168+ text = text.replaceAll("(#.*)",""); // strip comments
 169+ Hashtable<String,String> meta = new Hashtable<String,String>();
 170+
 171+ int flags = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;
 172+ Pattern wgmeta = Pattern.compile("[\"']wgMetaNamespace[\"']\\s*=>\\s*array\\s*\\((.*?)\\)",flags);
 173+ Pattern entry = Pattern.compile("[\"'](.*?)[\"']\\s*=>\\s*[\"'](.*?)[\"']",flags);
 174+ Matcher matcher = wgmeta.matcher(text);
 175+ while(matcher.find()){
 176+ Matcher me = entry.matcher(matcher.group(1));
 177+ while(me.find()){
 178+ meta.put(me.group(1),me.group(2));
 179+ }
 180+ }
 181+ return meta;
 182+ }
 183+
166184 /** Get wgNamespacesToBeSearchedDefault from InitialiseSettings */
167185 public Hashtable<String,NamespaceFilter> getDefaultSearch(String text){
168186 text = text.replaceAll("(#.*)",""); // strip comments
@@ -276,6 +294,7 @@
277295 System.out.println(p.getLanguages(initset));
278296 System.out.println(p.getServer(initset));
279297 System.out.println(p.getDefaultSearch(initset));
 298+ System.out.println(p.getMetaNamespace(initset));
280299
281300
282301 }
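For example, given an InitialiseSettings fragment such as 'wgMetaNamespace' => array( 'frwiki' => 'Wikipédia', 'plwiki' => 'Wikipedia' ), getMetaNamespace returns the map {frwiki=Wikipédia, plwiki=Wikipedia}; the dbnames and values here are illustrative.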
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/TitleReader.java
@@ -1,67 +0,0 @@
2 -package org.wikimedia.lsearch.ranks;
3 -
4 -import java.io.IOException;
5 -import java.util.ArrayList;
6 -import java.util.HashMap;
7 -import java.util.HashSet;
8 -import java.util.Iterator;
9 -import java.util.Map.Entry;
10 -
11 -import org.mediawiki.importer.DumpWriter;
12 -import org.mediawiki.importer.Page;
13 -import org.mediawiki.importer.Revision;
14 -import org.mediawiki.importer.Siteinfo;
15 -import org.wikimedia.lsearch.beans.ArticleLinks;
16 -import org.wikimedia.lsearch.beans.Title;
17 -import org.wikimedia.lsearch.config.IndexId;
18 -import org.wikimedia.lsearch.util.Localization;
19 -
20 -/**
21 - * Read a HashSet of titles from dump
22 - *
23 - * @author rainman
24 - *
25 - */
26 -public class TitleReader implements DumpWriter{
27 - Page page;
28 - Revision revision;
29 - Links links;
30 - protected String langCode;
31 -
32 - public TitleReader(String langCode, IndexId iid) throws IOException{
33 - this.langCode = langCode;
34 - this.links = Links.createNew(iid);
35 - }
36 -
37 - public void writeRevision(Revision revision) throws IOException {
38 - this.revision = revision;
39 - }
40 - public void writeStartPage(Page page) throws IOException {
41 - this.page = page;
42 - }
43 - public void writeEndPage() throws IOException {
44 - String key = page.Title.Namespace+":"+page.Title.Text;
45 - links.addTitle(new Title(key));
46 - }
47 - public Links getLinks() {
48 - return links;
49 - }
50 - public void close() throws IOException {
51 - // nop
52 - }
53 - public void writeEndWiki() throws IOException {
54 - // nop
55 - }
56 - public void writeSiteinfo(Siteinfo info) throws IOException {
57 - // write siteinfo to localization
58 - Iterator it = info.Namespaces.orderedEntries();
59 - while(it.hasNext()){
60 - Entry<Integer,String> pair = (Entry<Integer,String>)it.next();
61 - Localization.addCustomMapping(pair.getValue(),pair.getKey(),langCode);
62 - links.addToNamespaceMap(pair.getValue(),pair.getKey());
63 - }
64 - }
65 - public void writeStartWiki() throws IOException {
66 - // nop
67 - }
68 -}
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/LinkReader.java
@@ -3,6 +3,8 @@
44 import java.io.IOException;
55 import java.util.HashMap;
66 import java.util.HashSet;
 7+import java.util.Iterator;
 8+import java.util.Map.Entry;
79 import java.util.regex.Matcher;
810 import java.util.regex.Pattern;
911
@@ -35,12 +37,14 @@
3638 Links links;
3739 HashSet<String> interwiki;
3840 String langCode;
 41+ IndexId iid;
3942
40 - public LinkReader(Links links, String langCode){
 43+ public LinkReader(Links links, IndexId iid, String langCode){
4144 this.links = links;
4245 if(langCode == null || langCode.equals(""))
4346 langCode = "en";
4447 this.langCode = langCode;
 48+ this.iid = iid;
4549 interwiki = Localization.getInterwiki();
4650 }
4751 public void writeRevision(Revision revision) throws IOException {
@@ -50,10 +54,23 @@
5155 this.page = page;
5256 }
5357 public void writeEndPage() throws IOException {
54 - links.addArticleInfo(revision.Text,new Title(page.Title.Namespace,page.Title.Text));
 58+ Title t = new Title(page.Title.Namespace,page.Title.Text);
 59+ try{
 60+ links.addArticleInfo(revision.Text,t);
 61+ } catch(Exception e){
 62+ log.error("Error adding article "+t+" : "+e.getMessage());
 63+ e.printStackTrace();
 64+ }
5565 }
5666 public void writeSiteinfo(Siteinfo info) throws IOException {
5767 siteinfo = info;
 68+ // write siteinfo to localization
 69+ Iterator it = info.Namespaces.orderedEntries();
 70+ while(it.hasNext()){
 71+ Entry<Integer,String> pair = (Entry<Integer,String>)it.next();
 72+ Localization.addCustomMapping(pair.getValue(),pair.getKey(),iid.getDBname());
 73+ links.addToNamespaceMap(pair.getValue(),pair.getKey());
 74+ }
5875 }
5976 public void close() throws IOException {
6077 // nop
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Links.java
@@ -1,6 +1,11 @@
22 package org.wikimedia.lsearch.ranks;
33
 4+import java.io.ByteArrayInputStream;
 5+import java.io.ByteArrayOutputStream;
46 import java.io.IOException;
 7+import java.io.ObjectInputStream;
 8+import java.io.ObjectOutputStream;
 9+import java.io.StringWriter;
510 import java.util.ArrayList;
611 import java.util.Collection;
712 import java.util.HashMap;
@@ -15,6 +20,9 @@
1621 import org.apache.lucene.analysis.SimpleAnalyzer;
1722 import org.apache.lucene.document.Document;
1823 import org.apache.lucene.document.Field;
 24+import org.apache.lucene.document.FieldSelector;
 25+import org.apache.lucene.document.SetBasedFieldSelector;
 26+import org.apache.lucene.index.CorruptIndexException;
1927 import org.apache.lucene.index.IndexReader;
2028 import org.apache.lucene.index.IndexWriter;
2129 import org.apache.lucene.index.Term;
@@ -29,6 +37,7 @@
3038 import org.wikimedia.lsearch.config.IndexId;
3139 import org.wikimedia.lsearch.index.WikiIndexModifier;
3240 import org.wikimedia.lsearch.related.CompactArticleLinks;
 41+import org.wikimedia.lsearch.search.NamespaceFilter;
3342 import org.wikimedia.lsearch.spell.api.Dictionary;
3443 import org.wikimedia.lsearch.spell.api.LuceneDictionary;
3544 import org.wikimedia.lsearch.spell.api.Dictionary.Word;
@@ -40,61 +49,93 @@
4150 protected String langCode;
4251 protected IndexWriter writer = null;
4352 protected HashMap<String,Integer> nsmap = null;
44 - protected HashSet<String> interwiki = new HashSet<String>();
 53+ protected HashSet<String> interwiki;
 54+ protected HashSet<String> categoryLocalized;
 55+ protected HashSet<String> imageLocalized;
4556 protected IndexReader reader = null;
4657 protected String path;
47 - protected enum State { MODIFIED_TITLES, FLUSHED, MODIFIED_ARTICLES, READ };
 58+ protected enum State { FLUSHED, WRITE, MODIFIED, READ };
4859 protected State state;
49 - protected Directory directory;
 60+ protected Directory directory = null;
 61+ protected NamespaceFilter nsf; // default search
 62+ protected ObjectCache cache;
 63+ //protected ObjectCache refCache;
 64+ protected FieldSelector keyOnly,redirectOnly,contextOnly,linksOnly;
5065
51 - private Links(IndexId iid){
 66+ private Links(IndexId iid, String path, IndexWriter writer) throws CorruptIndexException, IOException{
 67+ this.writer = writer;
 68+ this.path = path;
5269 this.iid = iid;
53 - this.langCode = GlobalConfiguration.getInstance().getLanguage(iid);
 70+ GlobalConfiguration global = GlobalConfiguration.getInstance();
 71+ this.langCode = global.getLanguage(iid);
 72+ String dbname = iid.getDBname();
 73+ nsmap = Localization.getLocalizedNamespaces(langCode,dbname);
 74+ interwiki = Localization.getInterwiki();
 75+ categoryLocalized = Localization.getLocalizedCategory(langCode,dbname);
 76+ imageLocalized = Localization.getLocalizedImage(langCode,dbname);
 77+ state = State.FLUSHED;
 78+ initWriter(writer);
 79+ //reader = IndexReader.open(path);
 80+ nsf = global.getDefaultNamespace(iid);
 81+ cache = new ObjectCache(10000);
 82+ // init cache manager
 83+ /*CacheManager manager = CacheManager.create();
 84+ cache = new Cache("links", 5000, false, false, 5, 2);
 85+ manager.addCache(cache); */
 86+ keyOnly = makeSelector("article_key");
 87+ redirectOnly = makeSelector("redirect");
 88+ contextOnly = makeSelector("context");
 89+ linksOnly = makeSelector("links");
5490 }
5591
56 - public static Links openExisting(IndexId iid) throws IOException{
57 - Links links = new Links(iid);
58 - links.path = iid.getTempPath();
59 - log.info("Using index at "+links.path);
60 - links.writer = WikiIndexModifier.openForWrite(links.path,false);
61 - initWriter(links.writer);
62 - links.reader = IndexReader.open(links.path);
63 - links.nsmap = Localization.getLocalizedNamespaces(links.langCode);
64 - links.interwiki = Localization.getInterwiki();
65 - links.state = State.FLUSHED;
66 - links.directory = links.writer.getDirectory();
67 - return links;
 92+ protected FieldSelector makeSelector(String field){
 93+ HashSet<String> onlySet = new HashSet<String>();
 94+ onlySet.add(field);
 95+ return new SetBasedFieldSelector(onlySet, new HashSet<String>());
6896 }
6997
70 - private static void initWriter(IndexWriter writer) {
71 - writer.setMergeFactor(20);
72 - writer.setMaxBufferedDocs(500);
73 - writer.setUseCompoundFile(true);
 98+ private void initWriter(IndexWriter writer) {
 99+ if(writer != null){
 100+ writer.setMergeFactor(20);
 101+ writer.setMaxBufferedDocs(500);
 102+ writer.setUseCompoundFile(true);
 103+ if(directory == null)
 104+ directory = writer.getDirectory();
 105+ }
74106 }
75 -
 107+
 108+ /** Open the index path for updates */
 109+ public static Links openForModification(IndexId iid) throws IOException{
 110+ iid = iid.getLinks();
 111+ String path = iid.getIndexPath();
 112+ log.info("Using index at "+path);
 113+ IndexWriter writer = WikiIndexModifier.openForWrite(path,false);
 114+ return new Links(iid,path,writer);
 115+ }
 116+
 117+ /** Open index at path for reading */
 118+ public static Links openForRead(IndexId iid, String path) throws IOException {
 119+ iid = iid.getLinks();
 120+ log.info("Opening for read "+path);
 121+ return new Links(iid,path,null);
 122+ }
 123+
 124+ /** Create new in the import path */
76125 public static Links createNew(IndexId iid) throws IOException{
77 - Links links = new Links(iid);
78 - links.path = iid.getTempPath();
79 - log.info("Making index at "+links.path);
80 - links.writer = WikiIndexModifier.openForWrite(links.path,true);
81 - links.reader = IndexReader.open(links.path);
82 - links.nsmap = Localization.getLocalizedNamespaces(links.langCode);
83 - links.interwiki = Localization.getInterwiki();
84 - links.state = State.FLUSHED;
85 - links.directory = links.writer.getDirectory();
 126+ iid = iid.getLinks();
 127+ String path = iid.getImportPath();
 128+ log.info("Making index at "+path);
 129+ IndexWriter writer = WikiIndexModifier.openForWrite(path,true);
 130+ Links links = new Links(iid,path,writer);
86131 return links;
87132 }
88133
 134+ /** Create new index in memory (RAMDirectory) */
89135 public static Links createNewInMemory(IndexId iid) throws IOException{
90 - Links links = new Links(iid);
91 - links.path = iid.getTempPath();
92 - log.info("Making index at "+links.path);
93 - links.writer = new IndexWriter(new RAMDirectory(),new SimpleAnalyzer(),true);
94 - links.reader = IndexReader.open(links.path);
95 - links.nsmap = Localization.getLocalizedNamespaces(links.langCode);
96 - links.interwiki = Localization.getInterwiki();
97 - links.state = State.FLUSHED;
98 - links.directory = links.writer.getDirectory();
 136+ iid = iid.getLinks();
 137+ log.info("Making index in memory");
 138+ IndexWriter writer = new IndexWriter(new RAMDirectory(),new SimpleAnalyzer(),true);
 139+ Links links = new Links(iid,null,writer);
99140 return links;
100141 }
101142
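Note: the factories above replace the old openExisting/createNew pair with an explicit lifecycle: openForModification and createNew come up with a writer attached, openForRead never acquires one, and ensureWrite/ensureRead (later in this diff) switch states on demand. A hedged sketch of a build pass with made-up content:

import java.io.IOException;
import org.wikimedia.lsearch.beans.Title;
import org.wikimedia.lsearch.config.IndexId;
import org.wikimedia.lsearch.ranks.Links;

public class LinksBuildSketch {
    static void build(IndexId iid) throws IOException {
        Links links = Links.createNew(iid);  // fresh links index in the import path
        links.addArticleInfo("See [[Example link]].", new Title(0, "Example page"));
        links.flush();                       // optimize and close
    }
}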
@@ -105,23 +146,21 @@
106147 }
107148 }
108149
 150+ /** Add a custom namespace mapping */
109151 public void addToNamespaceMap(String namespace, int index){
110152 nsmap.put(namespace.toLowerCase(),index);
111153 }
112154
113 - /** Write all changes, call after batch-adding of titles and articles
 155+ /** Write all changes, optimize/close everything
114156 * @throws IOException */
115157 public void flush() throws IOException{
116158 // close & optimize
117 - reader.close();
 159+ if(reader != null)
 160+ reader.close();
118161 if(writer != null){
119162 writer.optimize();
120163 writer.close();
121164 }
122 - // reopen
123 - writer = new IndexWriter(directory, new SimpleAnalyzer(), false);
124 - initWriter(writer);
125 - reader = IndexReader.open(path);
126165 state = State.FLUSHED;
127166 }
128167
@@ -130,41 +169,71 @@
131170 * Can still read.
132171 * @throws IOException
133172 */
134 - public void flushForRead() throws IOException{
 173+ protected void flushForRead() throws IOException{
135174 // close & optimize
136 - reader.close();
137 - writer.optimize();
138 - writer.close();
 175+ if(reader != null)
 176+ reader.close();
 177+ if(writer != null){
 178+ writer.optimize();
 179+ writer.close();
 180+ }
 181+ log.debug("Opening index reader");
139182 // reopen
140183 reader = IndexReader.open(path);
141184 writer = null;
142185 state = State.READ;
143186 }
144187
145 - /** Add a title to enable proper link analysis when adding articles
146 - * @throws IOException */
147 - public void addTitle(Title t) throws IOException{
148 - Document doc = new Document();
149 - doc.add(new Field("namespace",Integer.toString(t.getNamespace()),Field.Store.YES,Field.Index.UN_TOKENIZED));
150 - doc.add(new Field("title",t.getTitle(),Field.Store.YES,Field.Index.UN_TOKENIZED));
151 - doc.add(new Field("title_key",t.getKey(),Field.Store.YES,Field.Index.UN_TOKENIZED));
152 - writer.addDocument(doc);
153 - state = State.MODIFIED_TITLES;
 188+ /** Open the writer, and close the reader (if any) */
 189+ protected void openForWrite() throws IOException{
 190+ if(reader != null)
 191+ reader.close();
 192+ if(writer == null){
 193+ if(directory == null)
 194+ throw new RuntimeException("Opened for read, but trying to write");
 195+ writer = new IndexWriter(directory,new SimpleAnalyzer(),false);
 196+ initWriter(writer);
 197+ reader = null;
 198+ state = State.WRITE;
 199+ }
154200 }
155201
 202+ protected void ensureRead() throws IOException {
 203+ if(state != State.READ)
 204+ flushForRead();
 205+ }
 206+
 207+ protected void ensureWrite() throws IOException {
 208+ if(writer == null)
 209+ openForWrite();
 210+ }
 211+
 212+ /** Modify existing article links info */
 213+ public void modifyArticleInfo(String text, Title t) throws IOException{
 214+ ensureWrite();
 215+ writer.deleteDocuments(new Term("article_key",t.getKey()));
 216+ addArticleInfo(text,t);
 217+ }
 218+
156219 /** Add links and other info from article
157220 * @throws IOException */
158221 public void addArticleInfo(String text, Title t) throws IOException{
159 - if(state == State.MODIFIED_TITLES)
160 - flush();
 222+ ensureWrite();
161223 Pattern linkPat = Pattern.compile("\\[\\[(.*?)(\\|(.*?))?\\]\\]");
162224 int namespace = t.getNamespace();
163225 Matcher matcher = linkPat.matcher(text);
164226 int ns; String title;
165227 boolean escaped;
 228+
166229 HashSet<String> pagelinks = new HashSet<String>();
167 - HashSet<String> linkkeys = new HashSet<String>();
 230+ // article link -> contexts
 231+ HashMap<String,ArrayList<String>> contextMap = new HashMap<String,ArrayList<String>>();
168232
 233+		// use context only for namespaces in the default search set
 234+ boolean useContext = nsf.contains(t.getNamespace());
 235+
 236+ ContextParser cp = new ContextParser(text,imageLocalized,categoryLocalized,interwiki);
 237+
169238 Title redirect = Localization.getRedirectTitle(text,langCode);
170239 String redirectsTo = null;
171240 if(redirect != null){
@@ -172,9 +241,8 @@
173242 } else {
174243 while(matcher.find()){
175244 String link = matcher.group(1);
176 - String anchor = matcher.group(2);
177 - if(anchor != null && anchor.length()>1 && anchor.substring(1).equalsIgnoreCase(title(link)))
178 - anchor = null; // anchor same as link text
 245+ ContextParser.Context context = useContext? cp.getNext(matcher.start(1)) : null;
 246+
179247 int fragment = link.lastIndexOf('#');
180248 if(fragment != -1)
181249 link = link.substring(0,fragment);
@@ -204,156 +272,107 @@
205273 }
206274 if(ns == 0 && namespace!=0)
207275 continue; // skip links from other namespaces into the main namespace
208 - String target = findTargetLink(ns,title);
 276+ String target = findTargetLink(ns,title);
209277 if(target != null){
210 - //System.out.println("Found "+link);
211 - linkkeys.add(target); // for outlink storage
212 - pagelinks.add(target+"|"); // for backlinks
213 - if(anchor != null && !"|".equals(anchor))
214 - pagelinks.add(target+anchor); // for efficient anchortext extraction
 278+ int targetNs = Integer.parseInt(target.substring(0,target.indexOf(':')));
 279+ pagelinks.add(target); // for outlink storage
 280+ // register context of this link
 281+ if(context != null && nsf.contains(targetNs)){
 282+ ArrayList<String> ct = contextMap.get(target);
 283+ if(ct==null){
 284+ ct = new ArrayList<String>();
 285+ contextMap.put(target,ct);
 286+ }
 287+ ct.add(context.get(text));
 288+ }
215289 }
216290 }
217291 }
218292 // index article
219 - StringList sl = new StringList(pagelinks);
220 - StringList lk = new StringList(linkkeys);
 293+ StringList lk = new StringList(pagelinks);
221294 Analyzer an = new SplitAnalyzer();
222295 Document doc = new Document();
223 - doc.add(new Field("namespace",t.getNamespaceAsString(),Field.Store.YES,Field.Index.UN_TOKENIZED));
224 - doc.add(new Field("title",t.getTitle(),Field.Store.YES,Field.Index.UN_TOKENIZED));
225296 doc.add(new Field("article_key",t.getKey(),Field.Store.YES,Field.Index.UN_TOKENIZED));
226297 if(redirectsTo != null)
227 - doc.add(new Field("redirect",redirectsTo,Field.Store.YES,Field.Index.UN_TOKENIZED));
 298+ doc.add(new Field("redirect",redirectsTo+"|"+t.getKey(),Field.Store.YES,Field.Index.UN_TOKENIZED));
228299 else{
229 - doc.add(new Field("links",sl.toString(),Field.Store.NO,Field.Index.TOKENIZED));
230 - doc.add(new Field("links_stored",lk.toString(),Field.Store.YES,Field.Index.TOKENIZED));
 300+ doc.add(new Field("links",lk.toString(),Field.Store.COMPRESS,Field.Index.TOKENIZED));
231301 }
 302+ if(contextMap.size() != 0){
 303+ // serialize contextMap (article link -> contexts) into the compressed binary "context" field
 314+ doc.add(new Field("context",new StringMap(contextMap).serialize(),Field.Store.COMPRESS));
 315+ }
232316
233317 writer.addDocument(doc,an);
234 - state = State.MODIFIED_ARTICLES;
 318+ state = State.MODIFIED;
235319 }
236 - public static HashSet<Character> separators = new HashSet<Character>();
237 - static{
238 - separators.add(' ');
239 - separators.add('\r');
240 - separators.add('\n');
241 - separators.add('\t');
242 - separators.add(':');
243 - separators.add('(');
244 - separators.add(')');
245 - separators.add('[');
246 - separators.add(']');
247 - separators.add('.');
248 - separators.add(',');
249 - separators.add(':');
250 - separators.add(';');
251 - separators.add('"');
252 - separators.add('+');
253 - separators.add('*');
254 - separators.add('!');
255 - separators.add('~');
256 - separators.add('$');
257 - separators.add('%');
258 - separators.add('^');
259 - separators.add('&');
260 - separators.add('_');
261 - separators.add('=');
262 - separators.add('|');
263 - separators.add('\\');
264 - }
265320
266 - /**
267 - * Find a sentance boundaries
268 - *
269 - * @param text - raw text
270 - * @param start - start index to search from
271 - * @param reverse - if true, will lookup in reverse
272 - * @param max - radius of search (if no boundary is found return last wordbreak)
273 - * @return
274 - */
275 - protected int findSentance(char[] text, int start, boolean reverse, int max){
276 - int inc = (reverse)? -1 : 1;
277 - int count = 0;
278 - int wordbreak = start;
279 - int i = start;
280 - for(;i>0 && i<text.length;i+=inc){
281 - char c = text[i];
282 - if(c == '.')
283 - return i;
284 - else if(c == '*' && ((i>1 && text[i-1]=='\n') || i==0))
285 - return i;
286 - else if(separators.contains(c))
287 - wordbreak = i;
288 - if(count >= max)
289 - return wordbreak; // more than max chars away, return the latest wordbreak
290 - count ++;
291 - }
292 - return i;
293 - }
294 -
295 - /** Find surrounding for a link - extract sentances, list items .... */
296 - protected String findContext(char[] text, int start, int end){
297 - // TODO: implement
298 - return null;
299 - }
300 -
301321 /** Find the target key (ns:title) to which the link points
302322 * @throws IOException */
303323 protected String findTargetLink(int ns, String title) throws IOException{
304324 String key;
305325 if(title.length() == 0)
306326 return null;
307 - // try exact match
308 - key = ns+":"+title;
309 - if(reader.docFreq(new Term("title_key",key)) != 0)
310 - return key;
311 - // try lowercase
312 - key = ns+":"+title.toLowerCase();
313 - if(reader.docFreq(new Term("title_key",key)) != 0)
314 - return key;
315 - // try lowercase with first letter upper case
 327+
 328+ // first letter uppercase
316329 if(title.length()==1)
317330 key = ns+":"+title.toUpperCase();
318331 else
319 - key = ns+":"+title.substring(0,1).toUpperCase()+title.substring(1).toLowerCase();
320 - if(reader.docFreq(new Term("title_key",key)) != 0)
321 - return key;
322 - // try title case
323 - key = ns+":"+WordUtils.capitalize(title);
324 - if(reader.docFreq(new Term("title_key",key)) != 0)
325 - return key;
326 - // try upper case
327 - key = ns+":"+title.toUpperCase();
328 - if(reader.docFreq(new Term("title_key",key)) != 0)
329 - return key;
330 - // try capitalizing at word breaks
331 - key = ns+":"+WordUtils.capitalize(title,new char[] {' ','-','(',')','}','{','.',',','?','!'});
332 - if(reader.docFreq(new Term("title_key",key)) != 0)
333 - return key;
334 -
335 - return null;
 332+ key = ns+":"+title.substring(0,1).toUpperCase()+title.substring(1);
 333+ return key; // index everything, even if the target article doesn't exist
336334 }
337335
338336 /** Get number of backlinks to this title */
339337 public int getNumInLinks(String key) throws IOException{
340 - return reader.docFreq(new Term("links",key+"|"));
 338+ ensureRead();
 344+ return reader.docFreq(new Term("links",key));
341348 }
342349
343351 /** Get all article titles that redirect to given title */
 350+ @Deprecated
344 - public ArrayList<String> getRedirectsTo(String key) throws IOException{
 352+ public ArrayList<String> getRedirectsToOld(String key) throws IOException{
 353+ ensureRead();
345354 ArrayList<String> ret = new ArrayList<String>();
346355 TermDocs td = reader.termDocs(new Term("redirect",key));
347356 while(td.next()){
348 - ret.add(reader.document(td.doc()).get("article_key"));
 357+ ret.add(reader.document(td.doc(),keyOnly).get("article_key"));
349358 }
350359 return ret;
351360 }
352361
353 - protected void ensureRead() throws IOException {
354 - if(state != State.READ)
355 - flushForRead();
 362+ /** Get all article titles that redirect to given title */
 363+ public ArrayList<String> getRedirectsTo(String key) throws IOException{
 364+ ensureRead();
 365+ ArrayList<String> ret = new ArrayList<String>();
 366+ String prefix = key+"|";
 367+ // reader.terms() is already positioned at the first matching term, so check it before next()
 368+ TermEnum te = reader.terms(new Term("redirect",prefix));
 369+ do{
 370+ Term t = te.term();
 371+ if(t == null || !t.field().equals("redirect") || !t.text().startsWith(prefix))
 372+ break;
 373+ ret.add(t.text().substring(prefix.length()));
 374+ } while(te.next());
 375+ return ret;
356376 }
357 -
358377
359378 /** If an article is a redirect
360379 * @throws IOException */
@@ -361,75 +380,43 @@
362381 ensureRead();
363382 TermDocs td = reader.termDocs(new Term("article_key",key));
364383 if(td.next()){
365 - if(reader.document(td.doc()).get("redirect")!=null)
 384+ if(reader.document(td.doc(),redirectOnly).get("redirect")!=null)
366385 return true;
367386 }
368387 return false;
369388 }
370 -
 389+
371391 /** If article is redirect, get target, else null */
 390+ @Deprecated
372 - public String getRedirectTarget(String key) throws IOException{
 392+ public String getRedirectTargetOld(String key) throws IOException{
373393 ensureRead();
374394 TermDocs td = reader.termDocs(new Term("article_key",key));
375395 if(td.next()){
376 - return reader.document(td.doc()).get("redirect");
 396+ return reader.document(td.doc(),redirectOnly).get("redirect");
377397 }
378398 return null;
379399 }
380400
381 - /** Get only anchors without frequency */
382 - public ArrayList<String> getAnchors(String key) throws IOException{
 401+ /** If article is redirect, get target, else null */
 402+ public String getRedirectTarget(String key) throws IOException{
383403 ensureRead();
384 - ArrayList<String> ret = new ArrayList<String>();
385 - TermEnum te = reader.terms(new Term("links",key+"|"));
386 - while(te.next()){
387 - String t = te.term().text();
388 - if(!t.startsWith(key) || !te.term().field().equals("links"))
389 - break;
390 - ret.add(t.substring(key.length()+1));
 404+ TermDocs td = reader.termDocs(new Term("article_key",key));
 405+ if(td.next()){
 406+ String t = reader.document(td.doc(),redirectOnly).get("redirect");
 407+ if(t != null) // field is stored as "target|source"
 408+ return t.substring(0,t.indexOf('|'));
391408 }
392 - return ret;
 409+ return null;
393410 }
394 -
395 - /** Get title part of the key (ns:title) */
396 - private String title(String key) {
397 - return key.substring(key.indexOf(':')+1);
398 - }
399411
400 - /** Get anchor texts for given title
401 - * @throws IOException */
402 - public ArrayList<AnchorText> getAnchorText(String key) throws IOException{
403 - ensureRead();
404 - ArrayList<AnchorText> ret = new ArrayList<AnchorText>();
405 - TermEnum te = reader.terms(new Term("links",key+"|"));
406 - while(te.next()){
407 - if(!te.term().text().startsWith(key) || !te.term().field().equals("links"))
408 - break;
409 - ret.add(new AnchorText(te.term().text().substring(key.length()),te.docFreq()));
410 - }
411 - return ret;
412 - }
413412
414 - static public class AnchorText {
415 - public String text; /** ns:title **/
416 - public int freq;
417 - public AnchorText(String text, int freq) {
418 - this.text = text;
419 - this.freq = freq;
420 - }
421 - }
422 -
423 - /** Get all article titles linking to given title
424 - * @throws IOException */
425 - public ArrayList<String> getInLinks(String key, HashMap<Integer,String> keyCache) throws IOException{
 413+ /** Return the namespace of the redirect target (if any) */
 414+ public int getRedirectTargetNamespace(String key) throws IOException{
426415 ensureRead();
427 - ArrayList<String> ret = new ArrayList<String>();
428 - TermDocs td = reader.termDocs(new Term("links",key+"|"));
429 - while(td.next()){
430 - ret.add(keyCache.get(td.doc()));
431 - //ret.add(reader.document(td.doc()).get("article_key"));
 416+ String t = getRedirectTarget(key);
 417+ if(t != null){
 418+ return Integer.parseInt(t.substring(0,t.indexOf(':')));
432419 }
433 - return ret;
 420+ return 0;
434421 }
435422
436423 /** Get all article titles linking to given title
@@ -437,9 +424,11 @@
438425 public ArrayList<CompactArticleLinks> getInLinks(CompactArticleLinks key, HashMap<Integer,CompactArticleLinks> keyCache) throws IOException{
439426 ensureRead();
440427 ArrayList<CompactArticleLinks> ret = new ArrayList<CompactArticleLinks>();
441 - TermDocs td = reader.termDocs(new Term("links",key+"|"));
 428+ TermDocs td = reader.termDocs(new Term("links",key.toString()));
442429 while(td.next()){
443 - ret.add(keyCache.get(td.doc()));
 430+ CompactArticleLinks cs = keyCache.get(td.doc());
 431+ if(cs != null)
 432+ ret.add(cs);
444433 }
445434 return ret;
446435 }
@@ -449,9 +438,9 @@
450439 public ArrayList<String> getInLinks(String key) throws IOException{
451440 ensureRead();
452441 ArrayList<String> ret = new ArrayList<String>();
453 - TermDocs td = reader.termDocs(new Term("links",key+"|"));
 442+ TermDocs td = reader.termDocs(new Term("links",key));
454443 while(td.next()){
455 - ret.add(reader.document(td.doc()).get("article_key"));
 444+ ret.add(reader.document(td.doc(),keyOnly).get("article_key"));
456445 }
457446 return ret;
458447 }
@@ -461,60 +450,77 @@
462451 ensureRead();
463452 TermDocs td = reader.termDocs(new Term("article_key",key));
464453 if(td.next()){
465 - return new StringList(reader.document(td.doc()).get("links_stored"));
 454+ return new StringList(reader.document(td.doc(),linksOnly).get("links"));
466455 }
467456 return null;
468457 }
469458
470 - public Dictionary getKeys() throws IOException{
 459+ /** Get all contexts in which article <i>to</i> is linked from <i>from</i>.
 460+ * Will return null if there is no context, or the link is invalid. */
 463+ public ArrayList<String> getContext(String from, String to) throws IOException {
471464 ensureRead();
472 - return new LuceneDictionary(reader,"article_key");
473 - }
474 - @Deprecated
475 - protected void cacheInLinks() throws IOException{
476 - if(state != State.FLUSHED)
477 - flush();
478 - log.info("Caching in-links");
479 - int count = 0;
480 - // docid -> key
481 - HashMap<Integer,String> keyCache = new HashMap<Integer,String>();
482 - Dictionary dict = new LuceneDictionary(reader,"article_key");
483 - Word w;
484 - // build key cache
485 - while((w = dict.next()) != null){
486 - String key = w.getWord();
487 - TermDocs td = reader.termDocs(new Term("article_key",key));
488 - if(td.next()){
489 - keyCache.put(td.doc(),key);
490 - } else
491 - log.error("Cannot find article for key "+key);
 465+ String cacheKey = "getContext:"+from;
 467+ Object fromCache = cache.get(cacheKey);
 468+ if(fromCache != null){
 471+ StringMap map = (StringMap) fromCache;
 472+ return map.get(to);
492473 }
493 -
494 - // get inlinks
495 - for(String key : keyCache.values()){
496 - ArrayList<String> in = getInLinks(key,keyCache);
497 - Document doc = new Document();
498 - doc.add(new Field("inlinks_key",key,Field.Store.YES,Field.Index.UN_TOKENIZED));
499 - doc.add(new Field("inlinks",new StringList(in).toString(),Field.Store.YES,Field.Index.UN_TOKENIZED));
500 - writer.addDocument(doc);
501 - count ++;
502 - if(count % 1000 == 0){
503 - System.out.println("Cached inlinks for "+count);
 474+ TermDocs td = reader.termDocs(new Term("article_key",from));
 475+ if(td.next()){
 476+ byte[] serialized = reader.document(td.doc(),contextOnly).getBinaryValue("context");
 477+ if(serialized == null)
 478+ return null;
 479+ StringMap map = new StringMap(serialized);
 480+ try {
 484+ // cache it!
 490+ cache.put(cacheKey,map);
 491+ return map.get(to);
 495+ } catch(Exception e){
 496+ e.printStackTrace();
504497 }
 498+
505499 }
 500+
 501+ return null;
506502 }
507503
508 - /** Get all article titles linking to given title (from inlinks cache)
509 - * @throws IOException */
510 - public Collection<String> getInLinksFromCache(String key) throws IOException{
511 - ensureRead();
512 - TermDocs td = reader.termDocs(new Term("inlinks_key",key));
513 - while(td.next()){
514 - return new StringList(reader.document(td.doc()).get("inlinks")).toCollection();
 504+ /** Get all contexts in which article <i>to</i> is linked from <i>from</i>
 505+ * (old format, one document per link).
 506+ * Will return null if there is no context, or the link is invalid. */
 508+ public Collection<String> getContextOld(String from, String to) throws IOException {
 509+ ensureRead();
 510+
 511+ TermDocs td = reader.termDocs(new Term("context_key",to+"|"+from));
 512+ if(td.next()){
 513+ return new StringList(reader.document(td.doc()).get("context")).toCollection();
515514 }
516 - return new ArrayList<String>();
 515+
 516+ return null;
517517 }
518 -
 518+
 519+ /** Get a dictionary of all article keys (ns:title) in this index */
 520+ public Dictionary getKeys() throws IOException{
 521+ ensureRead();
 522+ return new LuceneDictionary(reader,"article_key");
 523+ }
 524+
519525 public Integer getDocId(String key) throws IOException {
520526 TermDocs td = reader.termDocs(new Term("article_key",key));
521527 if(td.next()){
@@ -530,7 +536,18 @@
531537 if(reader != null)
532538 reader.close();
533539 if(directory != null)
534 - directory.close();
535 -
 540+ directory.close();
536541 }
 542+
 543+ public ObjectCache getCache() {
 544+ return cache;
 545+ }
 546+
537554 }
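A note on the encoding above: the "redirect" field packs both endpoints of a redirect into a single indexed term, "target|source", so all redirects to a target share the prefix "target|" and can be enumerated with one term scan (getRedirectsTo), while the target itself is recovered by splitting on '|' (getRedirectTarget). A minimal sketch of the convention — RedirectTerm is a hypothetical helper, not part of this commit, and assumes keys of the form ns:title never contain '|':

    public class RedirectTerm {
        // one indexed term per redirect: "target|source"
        public static String encode(String targetKey, String sourceKey){
            return targetKey + "|" + sourceKey;
        }
        public static String target(String term){
            return term.substring(0, term.indexOf('|'));
        }
        public static String source(String term){
            return term.substring(term.indexOf('|') + 1);
        }
        public static void main(String[] args){
            String term = encode("0:Main Page", "0:MainPage");
            System.out.println(target(term)); // 0:Main Page
            System.out.println(source(term)); // 0:MainPage
        }
    }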
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/ContextParser.java
@@ -0,0 +1,272 @@
 2+package org.wikimedia.lsearch.ranks;
 3+
 4+import java.util.ArrayList;
 5+import java.util.HashSet;
 7+
 9+
 10+/**
 11+ * Parse wiki-text into sentences. Each sentence will provide a
 12+ * context for links within it.
 13+ *
 14+ * @author rainman
 15+ *
 16+ */
 17+public class ContextParser {
 18+ protected char[] text;
 19+ protected int len;
 20+ protected HashSet<String> imageLocalized = null;
 21+ protected HashSet<String> categoryLocalized = null;
 22+ protected HashSet<String> interwiki = null;
 23+
 24+ protected ArrayList<Context> contexts = null;
 25+ protected int conIn = 0;
 26+
 27+ public static class Context {
 28+ int start;
 29+ int end;
 30+ String context = null;
 31+ public Context(int start, int end) {
 32+ this.start = start;
 33+ this.end = end;
 34+ }
 35+
 36+ public String get(String text){
 37+ if(context == null)
 38+ context = text.substring(start,end);
 39+ return context;
 40+ }
 41+
 42+ }
 43+
 44+ public ContextParser(String text, HashSet<String> imageLocalized, HashSet<String> categoryLocalized, HashSet<String> interwiki){
 45+ this.text = text.toCharArray();
 46+ this.len = this.text.length;
 47+ this.imageLocalized = imageLocalized;
 48+ this.categoryLocalized = categoryLocalized;
 49+ this.interwiki = interwiki;
 50+ parse();
 51+ }
 52+
 53+ /** Get the boundaries of all contexts (usually individual sentences) */
 54+ public ArrayList<Context> getContexts(){
 55+ return contexts;
 56+ }
 57+
 58+ /** Get context by index in text; must be called with monotonically increasing indexes */
 59+ public Context getNext(int index){
 60+ if(conIn >= contexts.size())
 61+ return null;
 62+ Context c = contexts.get(conIn);
 63+ if(c.start > index)
 64+ return null;
 65+ else{
 66+ for(;conIn<contexts.size();conIn++){
 67+ c = contexts.get(conIn);
 68+ if(c.start <= index && index < c.end)
 69+ return c;
 70+ if(c.start > index)
 71+ return null; // no context for this index
 72+ }
 73+ }
 74+ return null;
 75+ }
 76+
 77+ /** Fetch the link prefix (text up to the first ':'), at most 128 chars */
 78+ protected String fetchPrefix(int in){
 79+ int count = 0;
 80+ for(int i=in;i<len;i++,count++){
 81+ if(count >= 128)
 82+ return null;
 83+ if(text[i] == ':'){
 84+ return new String(text,in,count);
 85+ }
 86+ }
 87+ return null;
 88+ }
 89+
 90+ protected void parse(){
 91+ if(contexts != null)
 92+ return;
 93+ contexts = new ArrayList<Context>();
 94+ int cur = 0;
 95+ char c;
 96+ boolean seenLetter = false;
 97+ int topLinkLevel = 0;
 98+ boolean inQuotes = false;
 99+ int start = 0;
 100+ for(;cur<len;cur++){
 101+ c = text[cur];
 102+ if(!seenLetter && Character.isLetterOrDigit(c))
 103+ seenLetter = true;
 104+ switch(c){
 105+ case '[':
 106+ if(cur+2>=len)
 107+ continue; // EOF
 108+ if(text[cur+1]=='['){
 109+ boolean valid = false;
 110+ int closingInx = -1;
 111+ // seek to see if this is valid link opening
 112+ for(int i=cur+2;i<len && i<cur+512;i++){
 113+ if(text[i]=='[' && i+1<len && text[i+1]=='[')
 114+ break; // bad internal link
 115+ if(text[i]==']' && i+1<len && text[i+1]==']'){
 116+ topLinkLevel++; // ok, valid internal link
 117+ closingInx = i+2;
 118+ valid = true;
 119+ break;
 120+ }
 121+
 122+ }
 123+ // beginning of a link
 124+ String prefix = fetchPrefix(cur+2);
 125+ if(prefix != null && isImage(prefix)){
 126+ // take full image caption as one context
 127+ int lastPipe = cur + 2 + prefix.length();
 128+ int linkLevel = 0;
 129+ int imageEnd = -1;
 130+ for(int i=lastPipe;i<len;i++){
 131+ if(text[i]=='|')
 132+ lastPipe = i;
 133+ // internal link begin
 134+ if(text[i]=='[' && i+1<len && text[i+1]=='[')
 135+ linkLevel++;
 136+ // internal link end
 137+ if(text[i]==']' && i+1<len && text[i+1]==']'){
 138+ if(linkLevel == 0){
 139+ imageEnd = i+1;
 140+ break;
 141+ } else
 142+ linkLevel--;
 143+ }
 144+ }
 145+ // add everything up to the image as one context,
 146+ // and the image caption as a second context
 147+ if(imageEnd != -1){
 148+ if(seenLetter)
 149+ contexts.add(new Context(start,cur));
 150+ contexts.add(new Context(lastPipe+1,imageEnd-1));
 151+ start = imageEnd+1;
 152+ cur = imageEnd;
 153+ seenLetter = false;
 154+ }
 152+ } else if(valid && prefix != null && (isCategory(prefix) || isInterwiki(prefix))){
 153+ // skip categories and interwiki links
 154+ if(seenLetter)
 155+ contexts.add(new Context(start,cur));
 156+ start = cur;
 157+ cur = closingInx;
 158+ }
 159+ }
 160+ break;
 161+ case 'h': case 'f':
 162+ // check simple http/ftp links
 163+ if(checkPrefix(cur,"http://") || checkPrefix(cur,"ftp://")){
 164+ if(seenLetter && cur-start>2)
 165+ contexts.add(new Context(start,cur-1));
 166+ for(;cur<len;cur++){
 167+ if(text[cur]==' ' || text[cur]==']'){ // seek to after link
 168+ start = cur+1;
 169+ seenLetter = false;
 170+ break;
 171+ }
 172+ }
 173+ }
 174+ break;
 175+ case '<':
 176+ if(checkPrefix(cur,"<tr>") || checkPrefix(cur,"</tr>")){
 177+ if(seenLetter)
 178+ contexts.add(new Context(start,cur-1));
 179+ start = cur + 4;
 180+ }
 181+ break;
 182+ case ']':
 183+ if(cur+2>=len)
 184+ continue; // EOF
 185+ if(text[cur+1]==']' && topLinkLevel!=0){
 186+ topLinkLevel--;
 187+ }
 188+ break;
 189+ case '"':
 190+ // numbers like 6'5"
 191+ if(cur>0 && Character.isDigit(text[cur-1]))
 192+ break;
 193+ inQuotes = !inQuotes;
 194+ break;
 195+ case '=':
 196+ case '!':
 197+ case '?':
 198+ case '{':
 199+ case '}':
 200+ case '*':
 201+ case '#':
 202+ case '|':
 203+ case '.':
 204+ case '\n':
 208+ // only a double '=' (as in headings) is a separator
 209+ if(c == '=' && !(cur+1<len && text[cur+1]=='='))
 210+ break;
 211+ // '|' is a separator in tables etc., but not in link syntax like [[x|y]]
 212+ if(c == '|' && topLinkLevel != 0 && (cur+1<len && text[cur+1]!='-'))
 213+ break;
 214+ // don't split on a dot/comma between digits (e.g. 3.14)
 215+ if((c == '.' || c==',') && (cur>0 && Character.isDigit(text[cur-1]) && cur+1<len && Character.isDigit(text[cur+1])))
 216+ break;
 217+ // treat '\n' as a separator only at a paragraph break (\n\n) or an indented line (\n:)
 218+ if(c == '\n' && !(cur+1<len && (text[cur+1]=='\n' || text[cur+1]==':')))
 219+ break;
 220+
 221+ if(seenLetter){
 222+ contexts.add(new Context(start,cur));
 223+ start = cur + 1;
 224+ seenLetter = false;
 225+ }
 226+ break;
 227+ }
 228+ }
 229+ if(seenLetter)
 230+ contexts.add(new Context(start,len));
 231+ }
 232+
 233+ /** Check if the text at position cur begins with the given prefix */
 234+ private boolean checkPrefix(int cur, String prefix) {
 235+ if(cur + prefix.length() <= len){
 236+ for(int i=0;i<prefix.length();i++){
 237+ if(text[cur+i] != prefix.charAt(i))
 238+ return false;
 239+ }
 240+ return true;
 241+ }
 242+ return false;
 243+ }
 244+
 245+ /** Check if this is an "image" keyword using localization */
 246+ private final boolean isImage(String prefix){
 247+ prefix = prefix.toLowerCase();
 248+ if(prefix.equals("image"))
 249+ return true;
 250+ if(imageLocalized!=null && imageLocalized.contains(prefix))
 251+ return true;
 252+ return false;
 253+ }
 254+
 255+ private final boolean isCategory(String prefix){
 256+ prefix = prefix.toLowerCase();
 257+ if(prefix.equals("category"))
 258+ return true;
 259+ if(categoryLocalized!=null && categoryLocalized.contains(prefix))
 260+ return true;
 261+ return false;
 262+ }
 263+
 264+ private final boolean isInterwiki(String prefix){
 265+ if(interwiki!=null)
 266+ return interwiki.contains(prefix);
 267+ else
 268+ return false;
 269+ }
 270+
 271+
 272+
 273+}
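A minimal driver showing how ContextParser is meant to be used: parse once, then ask getNext() for each link offset in increasing order. ContextParserDemo is hypothetical and not part of this commit; passing null for the localization sets falls back to the English-only "image"/"category" checks above:

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class ContextParserDemo {
        public static void main(String[] args){
            String text = "Rome is the capital of [[Italy]]. It lies on the [[Tiber]].";
            ContextParser cp = new ContextParser(text, null, null, null);
            Matcher m = Pattern.compile("\\[\\[(.*?)\\]\\]").matcher(text);
            while(m.find()){
                // offsets must increase monotonically between calls
                ContextParser.Context c = cp.getNext(m.start(1));
                if(c != null)
                    System.out.println(m.group(1) + " -> " + c.get(text));
            }
        }
    }

Each link is printed with the sentence containing it, which is what Links.addArticleInfo stores per target key.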
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/RankBuilder.java
@@ -56,7 +56,6 @@
5757 public static void main(String[] args) throws IOException {
5858 String inputfile = null;
5959 String dbname = null;
60 - boolean useExistingTemp = false;
6160
6261 System.out.println("MediaWiki Lucene search indexer - build rank info from xml dumps.\n");
6362
@@ -64,15 +63,11 @@
6564 log = Logger.getLogger(RankBuilder.class);
6665
6766 if(args.length < 2){
68 - System.out.println("Syntax: java RankBuilder [-t] <inputfile> <dbname>");
69 - System.out.println("Options:");
70 - System.out.println(" -t - use existing temporary ranking index");
 67+ System.out.println("Syntax: java RankBuilder <inputfile> <dbname>");
7168 return;
7269 }
7370 for(int i=0;i<args.length;i++){
74 - if(args[i].equals("-t"))
75 - useExistingTemp = true;
76 - else if(inputfile == null)
 71+ if(inputfile == null)
7772 inputfile = args[i];
7873 else if(dbname == null)
7974 dbname = args[i];
@@ -92,59 +87,22 @@
9388 long start = System.currentTimeMillis();
9489
9590 // link info
96 - Links links = null;
97 - if(useExistingTemp)
98 - links = Links.openExisting(iid);
99 - else
100 - links = processLinks(inputfile,getTitles(inputfile,langCode,iid),langCode);
101 - //links.cacheInLinks();
102 - /*log.info("Creating ref count cache");
103 - HashMap<Integer,Integer> refCache = new HashMap<Integer,Integer>();
104 - HashMap<Integer,String> keyCache = new HashMap<Integer,String>();
105 - HashMap<String,Integer> docIdCache = new HashMap<String,Integer>();
106 - Word w; Dictionary d = links.getKeys();
107 - while((w = d.next()) != null){
108 - String key = w.getWord();
109 - int docid = links.getDocId(key);
110 - refCache.put(docid,links.getNumInLinks(key));
111 - keyCache.put(docid,key);
112 - docIdCache.put(key,docid);
 91+ Links links = Links.createNew(iid);
 92+ try{
 93+ processLinks(inputfile,links,iid,langCode);
 94+ } catch(IOException e){
 95+ log.fatal("I/O error processing "+inputfile+" : "+e.getMessage());
 96+ e.printStackTrace();
11397 }
114 - log.info("Caching in/out links");
115 - HashMap<Integer,int[]> outLinkCache = new HashMap<Integer,int[]>();
116 - HashMap<Integer,int[]> inLinkCache = new HashMap<Integer,int[]>();
117 - // cache in/out links
118 - d = links.getKeys();
119 - while((w = d.next()) != null){
120 - String key = w.getWord();
121 - int docid = docIdCache.get(key);
122 - Collection<String> in = links.getInLinks(key,keyCache);
123 - int[] inset = new int[in.size()];
124 - int i=0;
125 - for(String k : in)
126 - inset[i++] = docIdCache.get(k);
127 - inLinkCache.put(docid,inset);
128 -
129 - Collection<String> out = links.getOutLinks(key).toCollection();
130 - int[] outset = new int[out.size()];
131 - i = 0;
132 - for(String k : out){
133 - outset[i++] = docIdCache.get(k);
134 - }
135 - outLinkCache.put(docid,outset);
136 - }
137 - storeLinkAnalysis(links,iid,docIdCache,keyCache,refCache,inLinkCache,outLinkCache); */
138 - storeLinkAnalysis(links,iid);
139 - //Storage store = Storage.getInstance();
140 - //store.storePageReferences(links.getAll(),dbname);
141 - //storeRelated(store,links,dbname);
142 -
 98+
 99+ IndexThread.makeIndexSnapshot(iid.getLinks(),iid.getLinks().getImportPath());
 100+
143101 long end = System.currentTimeMillis();
144102
145103 System.out.println("Finished generating ranks in "+formatTime(end-start));
146104 }
147105
148 - //public static void storeLinkAnalysis(Links links, IndexId iid, HashMap<String, Integer> docIdCache, HashMap<Integer, String> keyCache, HashMap<Integer, Integer> refCache, HashMap<Integer, int[]> inLinkCache, HashMap<Integer, int[]> outLinkCache) throws IOException{
 106+ @Deprecated
149107 public static void storeLinkAnalysis(Links links, IndexId iid) throws IOException{
150108 log.info("Storing link analysis data");
151109 LinkAnalysisStorage store = new LinkAnalysisStorage(iid);
@@ -154,7 +112,7 @@
155113 String key = w.getWord();
156114 int ref = links.getNumInLinks(key);
157115 String redirectTarget = links.getRedirectTarget(key);
158 - ArrayList<String> anchor = links.getAnchors(key);
 116+ ArrayList<String> anchor = null; //links.getAnchors(key);
159117 ArrayList<Related> related = new ArrayList<Related>(); //FIXME: too slow getRelated(key,links,refCount,keyCache);
160118 //ArrayList<Related> related = getRelated(key,links,docIdCache,keyCache,refCache,inLinkCache,outLinkCache);
161119 ArrayList<String> redirect = links.getRedirectsTo(key);
@@ -164,53 +122,16 @@
165123
166124 }
167125
168 - public static Links processLinks(String inputfile, Links links, String langCode) {
169 - log.info("Second pass, calculating article links...");
170 - InputStream input = null;
171 - // second pass - calculate page ranks
172 - try {
173 - input = Tools.openInputFile(inputfile);
174 - } catch (IOException e) {
175 - log.fatal("I/O error opening "+inputfile+" : "+e.getMessage());
176 - return null;
177 - }
 126+ public static Links processLinks(String inputfile, Links links, IndexId iid, String langCode) throws IOException {
 127+ log.info("Calculating article links...");
 128+ InputStream input = Tools.openInputFile(inputfile);
178129 // calculate ranks
179 - LinkReader rr = new LinkReader(links,langCode);
 130+ LinkReader rr = new LinkReader(links,iid,langCode);
180131 XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(rr, 5000));
181 - try {
182 - reader.readDump();
183 - links.flush();
184 - } catch (IOException e) {
185 - log.fatal("I/O error reading dump while calculating ranks for from "+inputfile+" : "+e.getMessage());
186 - return null;
187 - }
 132+ reader.readDump();
 133+ links.flush();
188134 return links;
189135 }
190 -
191 - public static Links getTitles(String inputfile,String langCode,IndexId iid) {
192 - log.info("First pass, getting a list of valid articles...");
193 - InputStream input = null;
194 - try {
195 - input = Tools.openInputFile(inputfile);
196 - } catch (IOException e) {
197 - log.fatal("I/O error opening "+inputfile+" : "+e.getMessage());
198 - return null;
199 - }
200 - try {
201 - // first pass, get titles
202 - TitleReader tr = new TitleReader(langCode,iid);
203 - XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(tr, 5000));
204 - reader.readDump();
205 - input.close();
206 - Links links = tr.getLinks();
207 - links.flush();
208 - return links;
209 - } catch (IOException e) {
210 - log.fatal("I/O error reading dump while getting titles from "+inputfile+" : "+e.getMessage());
211 - return null;
212 - }
213 -
214 - }
215136
216137 /**
217138 * Get related articles, sorted descending by score
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/StringMap.java
@@ -0,0 +1,198 @@
 2+package org.wikimedia.lsearch.ranks;
 3+
 4+import java.io.ByteArrayInputStream;
 5+import java.io.ByteArrayOutputStream;
 6+import java.io.DataInputStream;
 7+import java.io.DataOutputStream;
 8+import java.io.EOFException;
 9+import java.io.IOException;
 10+import java.io.UnsupportedEncodingException;
 11+import java.nio.ByteBuffer;
 12+import java.util.ArrayList;
 13+import java.util.Collections;
 14+import java.util.Comparator;
 15+import java.util.HashMap;
 16+import java.util.HashSet;
 17+import java.util.Map.Entry;
 18+
 19+public class StringMap {
 20+ protected int pos = 0;
 23+ protected HashMap<String,ArrayList<String>> map = null;
 24+ protected HashMap<Integer,ArrayList<Integer>> hashMap = null;
 25+ protected byte[] serialized = null;
 26+ public static final char DELIMITER = '\0';
 27+ protected final int INT_SIZE = Integer.SIZE / 8;
 28+
 29+ public StringMap(HashMap<String,ArrayList<String>> map){
 30+ this.map = map;
 31+ }
 32+
 33+ public StringMap(byte[] serialized) throws IOException{
 34+ this.serialized = serialized;
 35+ readHash();
 36+ }
 37+
 38+ /** Initialize the small hashmap at the beginning of the stream */
 39+ private void readHash() throws IOException {
 40+ hashMap = new HashMap<Integer,ArrayList<Integer>>();
 41+ ByteArrayInputStream ba = new ByteArrayInputStream(serialized);
 42+ DataInputStream di = new DataInputStream(ba);
 43+ int size = di.readInt();
 44+ for(int i=0;i<size;i++){
 45+ int hash = di.readInt();
 46+ ArrayList<Integer> pos = hashMap.get(hash);
 47+ if(pos == null){
 48+ pos = new ArrayList<Integer>();
 49+ hashMap.put(hash,pos);
 50+ }
 51+ pos.add(di.readInt());
 52+ }
 53+ }
 54+
 55+ protected int encLen(String str) throws UnsupportedEncodingException{
 56+ return str.getBytes("utf-8").length;
 57+ }
 58+
 59+ public byte[] serialize() throws IOException{
 60+ if(serialized != null)
 61+ return serialized;
 62+ // unique string, string -> index (within string segment)
 63+ HashMap<String,Integer> strings = new HashMap<String,Integer>();
 64+ // hash -> list of keys
 65+ HashMap<Integer,ArrayList<String>> hashs = new HashMap<Integer,ArrayList<String>>();
 66+ // contexts, key -> index of string (from strings)
 67+ HashMap<String,ArrayList<Integer>> contexts = new HashMap<String,ArrayList<Integer>>();
 68+ // keys in some order
 69+ ArrayList<String> keys = new ArrayList<String>();
 70+ keys.addAll(map.keySet());
 71+ int offset = 0;
 72+ for(String key : keys){
 73+ // mapping hash -> keys
 74+ int hash = key.hashCode();
 75+ ArrayList<String> hk = hashs.get(hash);
 76+ if(hk == null){
 77+ hk = new ArrayList<String>();
 78+ hashs.put(hash,hk);
 79+ }
 80+ hk.add(key);
 81+ // contexts
 82+ ArrayList<Integer> cc = new ArrayList<Integer>();
 83+ contexts.put(key,cc);
 84+ for(String s : map.get(key)){
 85+ // identifier
 86+ Integer i = strings.get(s);
 87+ if(i == null){
 88+ i = offset;
 89+ strings.put(s,i);
 90+ offset += encLen(s) + INT_SIZE;
 91+ }
 92+ cc.add(i);
 93+ }
 94+ }
 95+ int keyOffset = INT_SIZE+2*INT_SIZE*map.size();
 96+ int stringOffset = keyOffset;
 97+ // key -> offset
 98+ HashMap<String,Integer> keyOffsets = new HashMap<String,Integer>();
 99+ for(String key : keys){
 100+ keyOffsets.put(key,stringOffset);
 101+ stringOffset += INT_SIZE+encLen(key)+INT_SIZE+contexts.get(key).size()*INT_SIZE;
 102+ }
 103+ // serialize!
 104+ ByteArrayOutputStream ba = new ByteArrayOutputStream();
 105+ DataOutputStream ds = new DataOutputStream(ba);
 106+ ds.writeInt(map.size()); // one (hash, key offset) pair per key, so hash collisions are counted
 107+ // write out the hash -> key offset table
 108+ ArrayList<Entry<Integer,ArrayList<String>>> sortedHash = new ArrayList<Entry<Integer,ArrayList<String>>>();
 109+ sortedHash.addAll(hashs.entrySet());
 110+ Collections.sort(sortedHash,new Comparator<Entry<Integer,ArrayList<String>>>(){
 111+ public int compare(Entry<Integer, ArrayList<String>> o1, Entry<Integer, ArrayList<String>> o2) {
 112+ return o1.getKey() - o2.getKey();
 113+ }
 114+ });
 115+ // write pairs: [hash] [key offset]
 116+ for(Entry<Integer,ArrayList<String>> e : sortedHash){
 117+ int hash = e.getKey();
 118+ for(String key : e.getValue()){
 119+ ds.writeInt(hash);
 120+ ds.writeInt(keyOffsets.get(key));
 121+ }
 122+ }
 123+ // write: [ key.length ] [ key ] [context1_pos] [context2_pos] ...
 124+ for(String key : keys){
 125+ byte[] b = key.getBytes("utf-8");
 126+ ds.writeInt(b.length);
 127+ ds.write(b);
 128+ ArrayList<Integer> con = contexts.get(key);
 129+ if(con == null || con.size()==0)
 130+ ds.writeInt(0);
 131+ else{
 132+ ds.writeInt(con.size());
 133+ for(Integer index : con){
 134+ ds.writeInt(stringOffset+index);
 135+ }
 136+ }
 137+ }
 138+ // write string as [size] [string]
 139+ HashSet<String> written = new HashSet<String>();
 140+ for(String key : keys){
 141+ for(String c : map.get(key)){
 142+ if(written.contains(c))
 143+ continue;
 144+ byte[] b = c.getBytes("utf-8");
 145+ ds.writeInt(b.length);
 146+ ds.write(b);
 147+ written.add(c);
 148+ }
 149+ }
 150+ serialized = ba.toByteArray();
 151+ return serialized;
 152+ }
 153+
 154+ private final int read(){
 155+ return serialized[pos++] & 0xff;
 156+ }
 157+
 158+ protected int readInt() throws IOException {
 159+ int ch1 = read();
 160+ int ch2 = read();
 161+ int ch3 = read();
 162+ int ch4 = read();
 163+ if ((ch1 | ch2 | ch3 | ch4) < 0)
 164+ throw new EOFException();
 165+ return ((ch1 << 24) + (ch2 << 16) + (ch3 << 8) + (ch4 << 0));
 166+ }
 167+
 168+ protected String readString() throws IOException{
 169+ int len = readInt();
 170+ int start = pos;
 171+ pos+=len;
 172+ return new String(serialized,start,len,"utf-8");
 173+ }
 174+
 175+ /** Get the array of context strings for a key
 176+ * @throws IOException */
 177+ public ArrayList<String> get(String key) throws IOException{
 178+ ArrayList<String> ret = new ArrayList<String>();
 179+ if(!hashMap.containsKey(key.hashCode()))
 180+ return ret;
 181+ for(Integer p : hashMap.get(key.hashCode())){
 182+ pos = p;
 183+ String k = readString();
 184+ if(key.equals(k)){
 185+ // found key, read context
 186+ int num = readInt();
 187+ int[] strings = new int[num];
 188+ for(int i=0;i<num;i++){
 189+ strings[i] = readInt();
 190+ }
 191+ for(int strpos : strings){
 192+ pos = strpos;
 193+ ret.add(readString());
 194+ }
 195+ }
 196+ }
 197+ return ret;
 198+ }
 199+}
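The layout written by serialize() is [pair count] [hash, key offset]* [key length, key, context count, context offsets]* [string length, string]*, so get() can jump from the in-memory hash table straight to one key record and decode only the strings it needs. A hypothetical round trip (StringMapDemo is not part of this commit):

    import java.util.ArrayList;
    import java.util.HashMap;

    public class StringMapDemo {
        public static void main(String[] args) throws Exception {
            HashMap<String,ArrayList<String>> map = new HashMap<String,ArrayList<String>>();
            ArrayList<String> contexts = new ArrayList<String>();
            contexts.add("Rome is the capital of [[Italy]]");
            map.put("0:Italy", contexts);
            byte[] bytes = new StringMap(map).serialize(); // write side
            StringMap read = new StringMap(bytes);         // read side: decodes only the hash table
            System.out.println(read.get("0:Italy"));       // seeks into the byte array on demand
            System.out.println(read.get("0:France"));      // unknown key -> empty list
        }
    }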
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/ObjectCache.java
@@ -0,0 +1,67 @@
 2+package org.wikimedia.lsearch.ranks;
 3+
 4+import java.util.HashMap;
 7+
 8+/**
 9+ * Maintain a cache of objects. Cache is a simple FIFO cache of
 10+ * constant size. Oldest entries get replaced by newer ones.
 11+ *
 12+ * @author rainman
 13+ *
 14+ */
 15+public class ObjectCache {
 16+ /** used to maintain FIFO cache of valid keys */
 17+ protected String[] fifo;
 18+ /** storage of objects */
 19+ protected HashMap<String,Object> objs = new HashMap<String,Object>();
 20+ protected int size, inx;
 21+
 22+ protected long hits = 0;
 23+ protected long miss = 0;
 24+
 25+ protected int report = 0;
 26+
 27+ public ObjectCache(int size){
 28+ this.size = size;
 29+ this.fifo = new String[size];
 30+ this.inx = 0;
 31+ }
 32+
 33+ public void put(String key, Object obj){
 34+ // add to FIFO queue only if not already in it
 35+ if(!objs.containsKey(key)){
 36+ if(inx >= size)
 37+ inx = 0;
 38+ String del = fifo[inx];
 39+ if(del != null){
 40+ //remove oldest from cache
 41+ objs.remove(del);
 42+ }
 43+ fifo[inx] = key; // latest cached key
 44+ inx++;
 45+ }
 46+ objs.put(key,obj);
 47+ }
 48+
 49+ public Object get(String key){
 50+ if(++report >= 5000){
 51+ report = 0;
 52+ System.out.println(getStats());
 53+ }
 54+ Object obj = objs.get(key);
 55+ if(obj !=null )
 56+ hits++;
 57+ else
 58+ miss++;
 59+ return obj;
 60+ }
 61+
 62+ public String getStats(){
 63+ long total = hits+miss;
 64+ return "HITS: "+hits+" ("+((float)hits*100/total)+"%), MISS: "+miss+" ("+((float)miss*100/total)+"%)";
 65+ }
 66+
 67+
 68+}
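Usage sketch of the FIFO eviction (ObjectCacheDemo is hypothetical): with capacity 2, inserting a third key overwrites the oldest slot.

    public class ObjectCacheDemo {
        public static void main(String[] args){
            ObjectCache cache = new ObjectCache(2);
            cache.put("a", 1);
            cache.put("b", 2);
            cache.put("c", 3);                  // evicts "a", the oldest entry
            System.out.println(cache.get("a")); // null -> counted as a miss
            cache.get("b"); cache.get("c");     // hits
            System.out.println(cache.getStats());
        }
    }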
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/EnglishAnalyzer.java
@@ -30,6 +30,7 @@
3131 import org.apache.lucene.analysis.Analyzer;
3232 import org.apache.lucene.analysis.TokenStream;
3333 import org.wikimedia.lsearch.analyzers.WikiTokenizer;
 34+import org.wikimedia.lsearch.config.IndexId;
3435
3536 /**
3637 * @author Kate Turner
@@ -58,6 +59,6 @@
5960 if(streams.get(fieldName) != null)
6061 return streams.get(fieldName);
6162
62 - return new AliasPorterStemFilter(new WikiTokenizer(text,"en",false));
 63+ return new AliasPorterStemFilter(new WikiTokenizer(text,IndexId.get("enwiki"),false));
6364 }
6465 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java
@@ -18,6 +18,7 @@
1919 import org.wikimedia.lsearch.analyzers.WikiQueryParser.NamespacePolicy;
2020 import org.wikimedia.lsearch.config.Configuration;
2121 import org.wikimedia.lsearch.config.GlobalConfiguration;
 22+import org.wikimedia.lsearch.config.IndexId;
2223 import org.wikimedia.lsearch.index.WikiIndexModifier;
2324 import org.wikimedia.lsearch.search.NamespaceFilter;
2425
@@ -40,7 +41,7 @@
4142 WikiQueryParser.KEYWORD_BOOST = 0.05f;
4243 WikiQueryParser.ADD_TITLE_PHRASES = false;
4344 WikiIndexModifier.ALT_TITLES = 3;
44 - FieldBuilder.BuilderSet bs = new FieldBuilder("").getBuilder();
 45+ FieldBuilder.BuilderSet bs = new FieldBuilder(IndexId.get("enwiki")).getBuilder();
4546 FieldNameFactory ff = new FieldNameFactory();
4647 try{
4748 WikiQueryParser parser = new WikiQueryParser(bs.getFields().contents(),new SimpleAnalyzer(),bs,null);
@@ -126,9 +127,10 @@
127128 // extraction of phrases
128129 ArrayList<String> stopWords = new ArrayList<String>();
129130 stopWords.add("the"); stopWords.add("who");
130 - stopWords.add("is"); stopWords.add("a");
131 - Analyzer analyzer = Analyzers.getSearcherAnalyzer("en");
132 - bs = new FieldBuilder("en").getBuilder();
 131+ stopWords.add("is"); stopWords.add("a");
 132+ IndexId enwiki = IndexId.get("enwiki");
 133+ Analyzer analyzer = Analyzers.getSearcherAnalyzer(enwiki);
 134+ bs = new FieldBuilder(enwiki).getBuilder();
133135 parser = new WikiQueryParser(bs.getFields().title(),"0",analyzer,bs,NamespacePolicy.IGNORE,stopWords);
134136 assertEquals("[how, do, you, do]",parser.extractWords(parser.parseRaw("how do you do")).toString());
135137 assertEquals("[making, something, rest]",parser.extractWords(parser.parseRaw("(help:making something incategory:blah) OR (rest incategory:crest)")).toString());
@@ -230,8 +232,8 @@
231233 // ==================================
232234 // Tests with actual params :)
233235 // ==================================
234 - analyzer = Analyzers.getSearcherAnalyzer("en");
235 - bs = new FieldBuilder("en").getBuilder();
 236+ analyzer = Analyzers.getSearcherAnalyzer(enwiki);
 237+ bs = new FieldBuilder(enwiki).getBuilder();
236238 parser = new WikiQueryParser(bs.getFields().contents(),"0",analyzer,bs,NamespacePolicy.LEAVE);
237239 WikiQueryParser.ADD_STEM_TITLE = false;
238240 WikiQueryParser.STEM_TITLE_BOOST = 0;
@@ -354,8 +356,8 @@
355357 assertEquals("(+(contents:something contents:someth^0.5) +contents:for +(contents:godel contents:goedel)) (+title:something^2.0 +title:for^2.0 +(title:godel^2.0 title:goedel^2.0)) ((+alttitle1:something^6.0 +alttitle1:for^6.0 +(alttitle1:godel^6.0 alttitle1:goedel^6.0)) (+alttitle2:something^6.0 +alttitle2:for^6.0 +(alttitle2:godel^6.0 alttitle2:goedel^6.0)) (+alttitle3:something^6.0 +alttitle3:for^6.0 +(alttitle3:godel^6.0 alttitle3:goedel^6.0)))",q.toString());
356358
357359 // Backward compatiblity for complex filters
358 - analyzer = Analyzers.getSearcherAnalyzer("en");
359 - bs = new FieldBuilder("en").getBuilder();
 360+ analyzer = Analyzers.getSearcherAnalyzer(enwiki);
 361+ bs = new FieldBuilder(enwiki).getBuilder();
360362 parser = new WikiQueryParser(bs.getFields().contents(),"0,1,4,12",analyzer,bs,NamespacePolicy.IGNORE);
361363
362364 q = parser.parseTwoPass("beans everyone",NamespacePolicy.REWRITE);
@@ -381,15 +383,15 @@
382384 assertEquals("[(many,1,5), (more,7,11), (has,16,19), (some,23,27), (g,29,30)]",t.toString());
383385
384386 // German
385 - analyzer = Analyzers.getSearcherAnalyzer("de");
386 - bs = new FieldBuilder("de").getBuilder();
 387+ analyzer = Analyzers.getSearcherAnalyzer(IndexId.get("dewiki"));
 388+ bs = new FieldBuilder(IndexId.get("dewiki")).getBuilder();
387389 parser = new WikiQueryParser(bs.getFields().contents(),"0",analyzer,bs,NamespacePolicy.IGNORE);
388390 q = parser.parseTwoPass("welche rolle spielen Mineralstoffe in der Ernährung?",NamespacePolicy.IGNORE);
389391 assertEquals("(+(contents:welche contents:welch^0.5) +(contents:rolle contents:roll^0.5) +(contents:spielen contents:spiel^0.5) +(contents:mineralstoffe contents:mineralstoff^0.5) +contents:in +contents:der +(+(contents:ernahrung contents:ernahr^0.5) (contents:ernaehrung contents:ernaehr^0.5))) (+title:welche^2.0 +title:rolle^2.0 +title:spielen^2.0 +title:mineralstoffe^2.0 +title:in^2.0 +title:der^2.0 +(title:ernahrung^2.0 title:ernaehrung^2.0))",q.toString());
390392
391393 // CJK
392 - analyzer = Analyzers.getSearcherAnalyzer("ja");
393 - bs = new FieldBuilder("ja").getBuilder();
 394+ analyzer = Analyzers.getSearcherAnalyzer(IndexId.get("jawiki"));
 395+ bs = new FieldBuilder(IndexId.get("jawiki")).getBuilder();
394396 parser = new WikiQueryParser(bs.getFields().contents(),"0",analyzer,bs,NamespacePolicy.IGNORE);
395397 q = parser.parseFourPass("うろパン",NamespacePolicy.IGNORE,false);
396398 assertEquals("contents:\"うろ ろハ ハン\" title:\"うろ ろハ ハン\"^2.0 (alttitle1:\"うろ ろハ ハン\"^6.0 alttitle2:\"うろ ろハ ハン\"^6.0 alttitle3:\"うろ ろハ ハン\"^6.0)",q.toString());
@@ -402,8 +404,8 @@
403405
404406
405407 // Malayalam
406 - analyzer = Analyzers.getSearcherAnalyzer("ml");
407 - bs = new FieldBuilder("ml").getBuilder();
 408+ analyzer = Analyzers.getSearcherAnalyzer(IndexId.get("mlwiki"));
 409+ bs = new FieldBuilder(IndexId.get("mlwiki")).getBuilder();
408410 parser = new WikiQueryParser(bs.getFields().contents(),"0",analyzer,bs,NamespacePolicy.IGNORE);
409411 q = parser.parseFourPass("കൊറിയ ",NamespacePolicy.IGNORE,false);
410412 assertEquals("contents:കറയ title:കറയ^2.0 (alttitle1:കറയ^6.0 alttitle2:കറയ^6.0 alttitle3:കറയ^6.0)",q.toString());
@@ -420,8 +422,8 @@
421423 WikiQueryParser.STEM_TITLE_BOOST = 1;
422424
423425 // Localization tests
424 - analyzer = Analyzers.getSearcherAnalyzer("sr");
425 - bs = new FieldBuilder("sr").getBuilder();
 426+ analyzer = Analyzers.getSearcherAnalyzer(IndexId.get("srwiki"));
 427+ bs = new FieldBuilder(IndexId.get("srwiki")).getBuilder();
426428 parser = new WikiQueryParser(bs.getFields().contents(),"0",analyzer,bs,NamespacePolicy.LEAVE);
427429
428430 q = parser.parseTwoPass("all:добродошли на википедију",NamespacePolicy.IGNORE);
@@ -430,8 +432,8 @@
431433 q = parser.parseTwoPass("all:dobrodošli na šđčćž",NamespacePolicy.IGNORE);
432434 assertEquals("(+contents:dobrodosli +contents:na +contents:sdjccz) (+title:dobrodosli^3.0 +title:na^3.0 +title:sdjccz^3.0)",q.toString());
433435
434 - analyzer = Analyzers.getSearcherAnalyzer("th");
435 - bs = new FieldBuilder("th").getBuilder();
 436+ analyzer = Analyzers.getSearcherAnalyzer(IndexId.get("thwiki"));
 437+ bs = new FieldBuilder(IndexId.get("thwiki")).getBuilder();
436438 parser = new WikiQueryParser(bs.getFields().contents(),"0",analyzer,bs,NamespacePolicy.LEAVE);
437439
438440 q = parser.parseTwoPass("ภาษาไทย",NamespacePolicy.IGNORE);
@@ -441,8 +443,8 @@
442444 assertEquals("(+namespace:12 +(+contents:ภาษา +contents:ไทย)) (+namespace:12 +(+title:ภาษา^3.0 +title:ไทย^3.0))",q.toString());
443445
444446 // vietnamese
445 - analyzer = Analyzers.getSearcherAnalyzer("vi");
446 - bs = new FieldBuilder("vi").getBuilder();
 447+ analyzer = Analyzers.getSearcherAnalyzer(IndexId.get("viwiki"));
 448+ bs = new FieldBuilder(IndexId.get("viwiki")).getBuilder();
447449 parser = new WikiQueryParser(bs.getFields().contents(),"0",analyzer,bs,NamespacePolicy.LEAVE);
448450
449451 q = parser.parseTwoPass("Gánh nước đêm trăng",NamespacePolicy.IGNORE);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java
@@ -11,11 +11,12 @@
1212 import org.apache.lucene.analysis.Token;
1313 import org.apache.lucene.analysis.TokenStream;
1414 import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine;
 15+import org.wikimedia.lsearch.config.IndexId;
1516 import org.wikimedia.lsearch.index.WikiIndexModifier;
1617
1718 public class FastWikiTokenizerTest {
1819 public static void displayTokensForParser(String text) {
19 - FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,"en",false);
 20+ FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,IndexId.get("enwiki"),false);
2021 Token[] tokens = parser.parse().toArray(new Token[] {});
2122 for (int i = 0; i < tokens.length; i++) {
2223 Token token = tokens[i];
@@ -116,7 +117,7 @@
117118 for(int i=0;i<2000;i++){
118119 for(TestArticle article : articles){
119120 String text = article.content;
120 - FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,"en",false);
 121+ FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,IndexId.get("enwiki"),false);
121122 parser.parse();
122123 }
123124 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SpellCheckTest.java
@@ -94,6 +94,8 @@
9595 {"cource", "course"},
9696 {"carolene products",""},
9797 {"orvileWright","overnight"},
 98+ {"livia tremor control","olivia tremor control"},
 99+ {"ommmited","omitted"},
98100
99101 };
100102
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/SearchDaemon.java
@@ -86,6 +86,10 @@
8787 sendOutputLine("#no suggestion");
8888 for(ResultSet rs : res.getResults()){
8989 sendResultLine(rs.score, rs.namespace, rs.title);
 90+ if(rs.getContext() != null){
 91+ for(String c : rs.getContext())
 92+ sendOutputLine("#context "+c);
 93+ }
9094 if(rs.getExplanation() != null)
9195 sendOutputLine(rs.getExplanation().toString());
9296 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/HttpHandler.java
@@ -41,7 +41,7 @@
4242 protected String postData;
4343
4444 protected final int BUF_SIZE = 8192;
45 - protected final char[] outputBuffer = new char[BUF_SIZE];
 45+ protected char[] outputBuffer = new char[BUF_SIZE];
4646 protected int bufLength = 0;
4747
4848 protected int minorVersion; // the x in HTTP 1.x
@@ -227,9 +227,12 @@
228228 log.debug(">>>"+sout);
229229 // write to buffer instead directly to stream!
230230 char[] s = (sout+"\r\n").toCharArray();
231 - if(bufLength + s.length >= BUF_SIZE)
 231+ if(bufLength + s.length >= outputBuffer.length)
232232 flushOutput();
233 - // FIXME: what if array is 2x larger than buffer?
 233+ // extend buffer if a single line exceeds its capacity
 234+ if(s.length > outputBuffer.length){
 235+ outputBuffer = new char[s.length*2];
 236+ }
234237 System.arraycopy(s,0,outputBuffer,bufLength,s.length);
235238 bufLength+=s.length;
236239 }
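A trace of the corrected buffering rule with hypothetical sizes (assumes flushOutput() resets bufLength to 0):

    // BUF_SIZE = 8192, bufLength = 100, incoming line s.length = 20000
    // 1) 100 + 20000 >= 8192           -> flushOutput(), bufLength = 0
    // 2) 20000 > outputBuffer.length   -> outputBuffer = new char[40000]
    // 3) arraycopy(s, 0, outputBuffer, 0, 20000) now fits in one pass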
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestTest.java
@@ -46,7 +46,7 @@
4747 Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid,false);
4848 NamespaceFilter nsDefault = new NamespaceFilter("0"); // default to main namespace
4949 FieldBuilder.Case dCase = FieldBuilder.Case.IGNORE_CASE;
50 - FieldBuilder.BuilderSet bs = new FieldBuilder(global.getLanguage(iid.getDBname()),dCase).getBuilder(dCase);
 50+ FieldBuilder.BuilderSet bs = new FieldBuilder(iid,dCase).getBuilder(dCase);
5151 WikiQueryParser parser = new WikiQueryParser(bs.getFields().contents(),nsDefault,analyzer,bs,WikiQueryParser.NamespacePolicy.IGNORE,null);
5252 while(true){
5353 System.out.print(">> ");
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexWriter.java
@@ -42,7 +42,7 @@
4343 public CleanIndexWriter(IndexId iid) throws IOException{
4444 GlobalConfiguration global = GlobalConfiguration.getInstance();
4545 this.iid = iid;
46 - this.builder = new FieldBuilder("",FieldBuilder.Case.IGNORE_CASE,FieldBuilder.Stemmer.NO_STEMMER,FieldBuilder.Options.SPELL_CHECK);
 46+ this.builder = new FieldBuilder(iid,FieldBuilder.Case.IGNORE_CASE,FieldBuilder.Stemmer.NO_STEMMER,FieldBuilder.Options.SPELL_CHECK);
4747 this.langCode = global.getLanguage(iid.getDBname());
4848 HashSet<String> stopWords = new HashSet<String>();
4949 for(String w : StopWords.getStopWords(iid,langCode))
@@ -90,7 +90,7 @@
9191 if(!WikiIndexModifier.checkAddPreconditions(a,langCode))
9292 return; // don't add if preconditions are not met
9393
94 - Object[] ret = WikiIndexModifier.makeDocumentAndAnalyzer(a,builder,iid);
 94+ Object[] ret = WikiIndexModifier.makeDocumentAndAnalyzer(a,builder,iid,null);
9595 Document doc = (Document) ret[0];
9696 Analyzer analyzer = (Analyzer) ret[1];
9797 try {
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/SpellCheckIndexer.java
@@ -176,13 +176,13 @@
177177 }
178178
179179 /**
180 - * Register a title in the index, without tokenization, just lowercase.
 180+ * Register a title in the index, without tokenization, stripped of accents and the like.
181181 *
182182 * @param title
183183 */
184184 public void addTitle(String title){
185185 Document doc = new Document();
186 - doc.add(new Field("title", title.toLowerCase(), Field.Store.NO, Field.Index.UN_TOKENIZED));
 186+ doc.add(new Field("title", FastWikiTokenizerEngine.stipTitle(title.toLowerCase()), Field.Store.NO, Field.Index.UN_TOKENIZED));
187187 ngramWriter.addDocument(doc);
188188 }
189189 /**
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/related/CompactRelated.java
@@ -36,7 +36,7 @@
3737 this.title = title;
3838 }
3939 public String serialize(){
40 - return score+" "+relates;
 40+ return ((float)score)+" "+relates;
4141 }
4242
4343 public static ArrayList<String> convertToStringList(Collection<CompactRelated> rel){
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/related/RelatedBuilder.java
@@ -18,11 +18,14 @@
1919 import org.mediawiki.dumper.ProgressFilter;
2020 import org.mediawiki.dumper.Tools;
2121 import org.mediawiki.importer.XmlDumpReader;
 22+import org.wikimedia.lsearch.beans.Title;
2223 import org.wikimedia.lsearch.config.Configuration;
2324 import org.wikimedia.lsearch.config.GlobalConfiguration;
2425 import org.wikimedia.lsearch.config.IndexId;
 26+import org.wikimedia.lsearch.config.IndexRegistry;
2527 import org.wikimedia.lsearch.index.IndexThread;
2628 import org.wikimedia.lsearch.ranks.Links;
 29+import org.wikimedia.lsearch.search.NamespaceFilter;
2730 import org.wikimedia.lsearch.spell.api.Dictionary;
2831 import org.wikimedia.lsearch.spell.api.Dictionary.Word;
2932 import org.wikimedia.lsearch.storage.ArticleAnalytics;
@@ -46,7 +49,7 @@
4750 System.out.println("MediaWiki Lucene search indexer - build a map of related articles.\n");
4851
4952 Configuration.open();
50 - if(args.length > 2 && args.length < 1){
 53+ if(args.length > 2 || args.length < 1){
5154 System.out.println("Syntax: java RelatedBuilder <dbname> [<dump file>]");
5255 return;
5356 }
@@ -64,7 +67,7 @@
6568 if(dumpfile != null)
6669 rebuildFromDump(dumpfile,iid);
6770 else
68 - rebuildFromTemp(iid);
 71+ rebuildFromLinks(iid);
6972 } catch (IOException e) {
7073 log.fatal("Rebuild I/O error: "+e.getMessage());
7174 e.printStackTrace();
@@ -83,7 +86,8 @@
8487 // first pass - titles
8588 InputStream input = null;
8689 input = Tools.openInputFile(inputfile);
87 - TitleReader tr = new TitleReader(langCode);
 90+ NamespaceFilter nsf = GlobalConfiguration.getInstance().getDefaultNamespace(iid);
 91+ TitleReader tr = new TitleReader(iid,langCode,nsf);
8892 XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(tr, 5000));
8993 reader.readDump();
9094 input.close();
@@ -104,32 +108,42 @@
105109 * Rebuild related articles index for iid
106110 * @throws IOException
107111 */
108 - public static void rebuildFromTemp(IndexId iid) throws IOException {
 112+ public static void rebuildFromLinks(IndexId iid) throws IOException {
109113 CompactLinks links = new CompactLinks();
110 - Links temp = Links.openExisting(iid);
 114+ Links temp = Links.openForRead(iid,iid.getLinks().getImportPath());
111115
112 - log.info("Reading all titles");
 116+ NamespaceFilter nsf = GlobalConfiguration.getInstance().getDefaultNamespace(iid);
 117+ log.info("Reading titles in default search");
113118 Dictionary dict = temp.getKeys();
114119 Word w;
115120 HashMap<Integer,CompactArticleLinks> keyCache = new HashMap<Integer,CompactArticleLinks>();
116121 while((w = dict.next()) != null){
117122 String key = w.getWord();
118 - links.add(key,temp.getNumInLinks(key));
119 - keyCache.put(temp.getDocId(key),links.get(key));
 123+ int ns = Integer.parseInt(key.substring(0,key.indexOf(':')));
 124+ if(nsf.contains(ns)){
 125+ links.add(key,temp.getNumInLinks(key));
 126+ keyCache.put(temp.getDocId(key),links.get(key));
 127+ }
120128 }
121129
122130 log.info("Reading in/out links");
123131 dict = temp.getKeys();
124132 while((w = dict.next()) != null){
125133 String key = w.getWord();
126 - CompactArticleLinks l = links.get(key);
127 - // inlinks
128 - l.setInLinks(temp.getInLinks(l,keyCache));
129 - // outlinks
130 - ArrayList<CompactArticleLinks> out = new ArrayList<CompactArticleLinks>();
131 - for(String k : temp.getOutLinks(key).toCollection())
132 - out.add(links.get(k));
133 - l.setOutLinks(out);
 134+ int ns = Integer.parseInt(key.substring(0,key.indexOf(':')));
 135+ if(nsf.contains(ns)){
 136+ CompactArticleLinks l = links.get(key);
 137+ // inlinks
 138+ l.setInLinks(temp.getInLinks(l,keyCache));
 139+ // outlinks
 140+ ArrayList<CompactArticleLinks> out = new ArrayList<CompactArticleLinks>();
 141+ for(String k : temp.getOutLinks(key).toCollection()){
 142+ CompactArticleLinks cs = links.get(k);
 143+ if(cs != null)
 144+ out.add(cs);
 145+ }
 146+ l.setOutLinks(out);
 147+ }
134148 }
135149 temp.close();
136150 temp = null; // GC
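The rebuild now reads the links index directly (hence the rename to rebuildFromLinks) and restricts both passes to the default search namespaces. Keys in the links index take the form "<ns>:<title>", so the namespace is recovered by parsing the integer prefix before the first colon. Note also the new null guard on out-links: once titles outside the filter are dropped, links.get(k) can legitimately return null for an out-link target, which previously would have been added unconditionally. A small sketch of the key convention (helper name and sample keys are illustrative):

    // KeySketch.java -- illustrative helper for the "<ns>:<title>" key format
    // used by the links index above.
    public class KeySketch {
        static int namespaceOf(String key) {
            // e.g. "0:Main Page" -> 0, "14:Physics" -> 14
            return Integer.parseInt(key.substring(0, key.indexOf(':')));
        }
        public static void main(String[] args) {
            System.out.println(namespaceOf("0:Main Page")); // 0, the main namespace
            System.out.println(namespaceOf("14:Physics"));  // 14, categories
        }
    }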
@@ -144,14 +158,19 @@
145159 RelatedStorage store = new RelatedStorage(iid);
146160 int num = 0;
147161 int total = links.getAll().size();
148 - for(CompactArticleLinks cs : links.getAll()){
 162+ NamespaceFilter nsf = GlobalConfiguration.getInstance().getDefaultNamespace(iid);
 163+ for(CompactArticleLinks cs : links.getAll()){
149164 num++;
150165 if(num % 1000 == 0)
151 - log.info("Storing ["+num+"/"+total+"]");
152 - ArrayList<CompactRelated> rel = getRelated(cs,links);
153 - if(rel.size() == 0)
154 - continue;
155 - store.addRelated(cs.toString(),rel);
 166+ log.info("Storing ["+num+"/"+total+"]");
 167+ Title t = new Title(cs.getKey());
 168+ // do analysis only for default search namespace (usually main namespace)
 169+ if(nsf.contains(t.getNamespace())){
 170+ ArrayList<CompactRelated> rel = getRelated(cs,links);
 171+ if(rel.size() == 0)
 172+ continue;
 173+ store.addRelated(cs.toString(),rel);
 174+ }
156175 }
157176 store.snapshot();
158177 }
@@ -161,15 +180,19 @@
162181 */
163182 public static ArrayList<CompactRelated> getRelated(CompactArticleLinks cs, CompactLinks links){
164183 ArrayList<CompactRelated> ret = new ArrayList<CompactRelated>();
165 -
166 - HashSet<CompactArticleLinks> ll = new HashSet<CompactArticleLinks>();
 184+
 185+ HashSet<CompactArticleLinks> ll = new HashSet<CompactArticleLinks>();
 186+ double maxnorm = 0; // maximal value for related score, used for scaling
167187 if(cs.linksIn != null){
168 - for(CompactArticleLinks csl : cs.linksIn)
 188+ for(CompactArticleLinks csl : cs.linksIn){
169189 ll.add(csl);
 190+ maxnorm += 1.0/norm(csl.links);
 191+ }
170192 }
171193 for(CompactArticleLinks from : ll){
172194 if(from != cs){
173 - double score = relatedScore(cs,ll,from);
 195+ double rscore = relatedScore(cs,ll,from);
 196+ double score = (rscore / maxnorm) * rscore;
174197 if(score != 0)
175198 ret.add(new CompactRelated(cs,from,score));
176199 }
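The stored score is no longer the raw relatedScore() value. maxnorm accumulates 1.0/norm(csl.links) over every in-linking page, i.e. the largest sum a perfectly related page could reach, and the result is (rscore/maxnorm)*rscore: the raw score scaled into [0,1] and multiplied back, which damps weakly related pages quadratically. A sketch of the rescaling; norm() is not defined in this hunk, so the stand-in below (growing with the link count) is an assumption, as is the shape of relatedScore()'s accumulation:

    // ScoreSketch.java -- illustrative rescaling, mirroring the hunk above.
    public class ScoreSketch {
        // stand-in for the project's norm(); its actual definition is not in this diff
        static double norm(int links) {
            return Math.max(1, links);
        }
        public static void main(String[] args) {
            int[] inLinkCounts = {3, 10, 50};   // link counts of pages linking in
            double maxnorm = 0;                 // best attainable related score
            for (int links : inLinkCounts)
                maxnorm += 1.0 / norm(links);
            double rscore = 0.2;                // hypothetical raw relatedScore() output
            double score = (rscore / maxnorm) * rscore;
            System.out.println("maxnorm=" + maxnorm + " score=" + score);
        }
    }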
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/related/LinkReader.java
@@ -80,34 +80,14 @@
8181 rank = links.get(key);
8282 if(rank != null)
8383 return rank;
84 - // try lowercase
85 - key = ns+":"+title.toLowerCase();
86 - rank = links.get(key);
87 - if(rank != null)
88 - return rank;
8984 // try lowercase with first letter upper case
9085 if(title.length()==1)
9186 key = ns+":"+title.toUpperCase();
9287 else
93 - key = ns+":"+title.substring(0,1).toUpperCase()+title.substring(1).toLowerCase();
 88+ key = ns+":"+title.substring(0,1).toUpperCase()+title.substring(1);
9489 rank = links.get(key);
9590 if(rank != null)
9691 return rank;
97 - // try title case
98 - key = ns+":"+WordUtils.capitalize(title);
99 - rank = links.get(key);
100 - if(rank != null)
101 - return rank;
102 - // try upper case
103 - key = ns+":"+title.toUpperCase();
104 - rank = links.get(key);
105 - if(rank != null)
106 - return rank;
107 - // try capitalizing at word breaks
108 - key = ns+":"+WordUtils.capitalize(title,new char[] {' ','-','(',')','}','{','.',',','?','!'});
109 - rank = links.get(key);
110 - if(rank != null)
111 - return rank;
11292
11393 return null;
11494 }
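The case-variant cascade in LinkReader is cut down: the lowercase, Title Case, UPPER CASE and capitalize-at-word-breaks lookups are all gone, leaving only the exact key and a first-letter-uppercase fallback, which now preserves the rest of the title instead of lower-casing it. A sketch of the surviving normalization (method and class name are illustrative):

    // FirstUpperSketch.java -- the one fallback lookup LinkReader still tries.
    public class FirstUpperSketch {
        static String firstUpper(String title) {
            if (title.length() == 1)
                return title.toUpperCase();
            // the tail is preserved as-is; it is no longer lower-cased
            return title.substring(0, 1).toUpperCase() + title.substring(1);
        }
        public static void main(String[] args) {
            System.out.println(firstUpper("quantum Mechanics")); // "Quantum Mechanics"
            System.out.println(firstUpper("x"));                 // "X"
        }
    }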
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/related/RelatedTitle.java
@@ -1,10 +1,13 @@
22 package org.wikimedia.lsearch.related;
33
 4+import java.util.ArrayList;
 5+
46 import org.wikimedia.lsearch.beans.Title;
57
68 public class RelatedTitle {
79 protected Title related;
810 protected double score;
 11+ protected ArrayList<String> contexts = null;
912
1013 public RelatedTitle(Title related, double score) {
1114 this.related = related;
@@ -22,6 +25,12 @@
2326 public void setScore(double score) {
2427 this.score = score;
2528 }
 29+ public ArrayList<String> getContexts() {
 30+ return contexts;
 31+ }
 32+ public void setContexts(ArrayList<String> contexts) {
 33+ this.contexts = contexts;
 34+ }
2635 @Override
2736 public String toString() {
2837 return related.toString()+" ("+score+")";
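RelatedTitle gains a contexts list, presumably the per-link context snippets whose extraction this branch is experimenting with; the diff only shows the accessor plumbing, and the field stays null until explicitly set. A hypothetical usage; the sample context string is invented, and Title's single-String constructor is inferred from its use as new Title(cs.getKey()) earlier in this diff:

    import java.util.ArrayList;
    import org.wikimedia.lsearch.beans.Title;
    import org.wikimedia.lsearch.related.RelatedTitle;

    public class RelatedTitleUsage {
        public static void main(String[] args) {
            RelatedTitle rt = new RelatedTitle(new Title("0:Example"), 1.5);
            ArrayList<String> ctx = new ArrayList<String>();
            ctx.add("... the sentence in which the related article was linked ...");
            rt.setContexts(ctx);   // contexts is null until set
            System.out.println(rt + " contexts=" + rt.getContexts());
        }
    }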
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/related/TitleReader.java
@@ -12,6 +12,8 @@
1313 import org.mediawiki.importer.Revision;
1414 import org.mediawiki.importer.Siteinfo;
1515 import org.wikimedia.lsearch.beans.ArticleLinks;
 16+import org.wikimedia.lsearch.config.IndexId;
 17+import org.wikimedia.lsearch.search.NamespaceFilter;
1618 import org.wikimedia.lsearch.util.Localization;
1719
1820 /**
@@ -25,9 +27,13 @@
2628 Revision revision;
2729 CompactLinks links = new CompactLinks();
2830 protected String langCode;
 31+ protected IndexId iid;
 32+ protected NamespaceFilter nsf;
2933
30 - public TitleReader(String langCode){
 34+ public TitleReader(IndexId iid, String langCode, NamespaceFilter nsf){
3135 this.langCode = langCode;
 36+ this.iid = iid;
 37+ this.nsf = nsf;
3238 }
3339
3440 public void writeRevision(Revision revision) throws IOException {
@@ -37,8 +43,10 @@
3844 this.page = page;
3945 }
4046 public void writeEndPage() throws IOException {
41 - String key = page.Title.Namespace+":"+page.Title.Text;
42 - links.add(key,0);
 47+ if(nsf.contains(page.Title.Namespace)){
 48+ String key = page.Title.Namespace+":"+page.Title.Text;
 49+ links.add(key,0);
 50+ }
4351 }
4452 public CompactLinks getTitles() {
4553 return links;
@@ -54,7 +62,7 @@
5563 Iterator it = info.Namespaces.orderedEntries();
5664 while(it.hasNext()){
5765 Entry<Integer,String> pair = (Entry<Integer,String>)it.next();
58 - Localization.addCustomMapping(pair.getValue(),pair.getKey(),langCode);
 66+ Localization.addCustomMapping(pair.getValue(),pair.getKey(),iid.getDBname());
5967 }
6068 }
6169 public void writeStartWiki() throws IOException {
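Two changes in TitleReader: the first dump pass now records only titles whose namespace passes the NamespaceFilter, and custom namespace names scraped from the dump's siteinfo are registered per database name rather than per language code, so two wikis that share a language no longer clobber each other's namespace aliases. Illustrative calls; the wiki names are examples, and the (String, Integer, String) argument order is as used in the hunk above:

    import org.wikimedia.lsearch.util.Localization;

    public class MappingSketch {
        public static void main(String[] args) {
            // keyed by dbname now, so same-language wikis keep separate aliases
            Localization.addCustomMapping("Wikipedia", 4, "enwiki");       // was keyed by "en"
            Localization.addCustomMapping("Wiktionary", 4, "enwiktionary");
        }
    }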
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerClient.java
@@ -73,7 +73,7 @@
7474 log.debug("Calling remotely indexUpdate("+myhost+","+iid+") on "+host);
7575 r.indexUpdated(myhost,iid.toString());
7676 } catch (Exception e) {
77 - log.warn("Error invoking remote method notifyIndexUpdated() on host "+host);
 77+ log.warn("Error invoking remote method notifyIndexUpdated() on host "+host+" : "+e.getMessage());
7878 continue;
7979 }
8080 }
@@ -102,10 +102,10 @@
103103 log.debug("Got new RMI messenger for host "+host);
104104 return r;
105105 } catch (RemoteException e) {
106 - log.warn("Cannot contact RMI registry for host "+host);
 106+ log.warn("Cannot contact RMI registry for host "+host+" : "+e.getMessage());
107107 throw e;
108108 } catch (NotBoundException e) {
109 - log.warn("No RMIMessenger instance at host "+host);
 109+ log.warn("No RMIMessenger instance at host "+host+" : "+e.getMessage());
110110 throw e;
111111 }
112112 }
@@ -126,7 +126,7 @@
127127 return res;
128128 } catch (Exception e) {
129129 //e.printStackTrace();
130 - log.warn("Error invoking remote method getIndexTimestamp() on host "+host);
 130+ log.warn("Error invoking remote method getIndexTimestamp() on host "+host+" : "+e.getMessage());
131131 }
132132 return null;
133133 }
@@ -137,7 +137,7 @@
138138 log.debug("Calling enqueueUpdateRecords("+records.length+" records) on "+host);
139139 r.enqueueUpdateRecords(records);
140140 } catch (Exception e) {
141 - log.warn("Error invoking remote method enqueueUpdateRecords() on host "+host);
 141+ log.warn("Error invoking remote method enqueueUpdateRecords() on host "+host+" : "+e.getMessage());
142142 throw e;
143143 }
144144 }
@@ -148,7 +148,7 @@
149149 log.debug("Calling enqueueFrontend("+records.length+" records) on "+host);
150150 r.enqueueFrontend(records);
151151 } catch (Exception e) {
152 - log.warn("Error invoking remote method enqueueFrontend() on host "+host);
 152+ log.warn("Error invoking remote method enqueueFrontend() on host "+host+" : "+e.getMessage());
153153 throw e;
154154 }
155155 }
@@ -159,7 +159,7 @@
160160 log.debug("Calling reportBack("+cards.length+" records) on "+host);
161161 r.reportBack(cards);
162162 } catch (Exception e) {
163 - log.warn("Error invoking remote method sendReports on host "+host);
 163+ log.warn("Error invoking remote method sendReports on host "+host+" : "+e.getMessage());
164164 }
165165 }
166166
@@ -177,7 +177,7 @@
178178 cache.invalidateSearchable(iid,host);
179179 SearchResults res = new SearchResults();
180180 res.retry();
181 - log.warn("Error invoking remote method searchPart on host "+host);
 181+ log.warn("Error invoking remote method searchPart on host "+host+" : "+e.getMessage());
182182 return res;
183183 }
184184 }
@@ -188,7 +188,7 @@
189189 log.debug("Calling requestFlushAndNotify("+dbname+" records) on "+host);
190190 return r.requestFlushAndNotify(dbname);
191191 } catch (Exception e) {
192 - log.warn("Error invoking remote method requestFlushAndNotify on host "+host);
 192+ log.warn("Error invoking remote method requestFlushAndNotify on host "+host+" : "+e.getMessage());
193193 return false;
194194 }
195195 }
@@ -199,7 +199,7 @@
200200 log.debug("Calling isSuccessfulFlush("+dbname+" records) on "+host);
201201 return r.isSuccessfulFlush(dbname);
202202 } catch (Exception e) {
203 - log.warn("Error invoking remote method isSuccessfulFlush on host "+host);
 203+ log.warn("Error invoking remote method isSuccessfulFlush on host "+host+" : "+e.getMessage());
204204 throw new IOException("Remote error");
205205 }
206206 }
@@ -212,7 +212,7 @@
213213 log.debug(" \\-> got: "+size);
214214 return size;
215215 } catch (Exception e) {
216 - log.warn("Error invoking remote method getIndexerQueueSize on host "+host);
 216+ log.warn("Error invoking remote method getIndexerQueueSize on host "+host+" : "+e.getMessage());
217217 return -1;
218218 }
219219 }
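Every catch block in RMIMessengerClient gets the same mechanical improvement: the warning now carries e.getMessage(), so a refused connection is distinguishable from an unbound messenger or a dead registry. A sketch of the pattern; a log4j Logger is assumed here, and the host and exception are fabricated for the demo:

    import org.apache.log4j.BasicConfigurator;
    import org.apache.log4j.Logger;

    public class WarnSketch {
        static Logger log = Logger.getLogger(WarnSketch.class);
        public static void main(String[] args) {
            BasicConfigurator.configure(); // console appender for the demo
            String host = "192.168.0.1";
            try {
                throw new java.rmi.RemoteException("connection refused");
            } catch (Exception e) {
                // appending the message turns an opaque warning into a diagnosable one
                log.warn("Error invoking remote method searchPart on host " + host + " : " + e.getMessage());
            }
        }
    }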