r26333 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: r26333
Date: 23:10, 2 October 2007
Author: rainman
Status: old
Tags:
Comment:
Experimental:
* extract context for each link and index it
* context retrieval uses:
  - StringMap for efficient hashmap serialization
  - ObjectCache, a simple FIFO cache for caching context fields (see the
    sketch below)
* experiment with various scoring schemes: use related as a boost for
  sloppy phrase matches, rank as a boost for exact phrases - probably
  will be changed further
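
For illustration, here is a minimal sketch of the kind of bounded FIFO cache the ObjectCache bullet describes, built on java.util.LinkedHashMap. The class name and API below are hypothetical and are not claimed to match the actual org.wikimedia.lsearch.ranks.ObjectCache added in this revision:

    // Illustrative sketch only: names and API here are hypothetical,
    // not the ObjectCache class from this commit.
    import java.util.LinkedHashMap;
    import java.util.Map;

    public class SimpleFifoCache<K, V> {
        private final LinkedHashMap<K, V> map;

        public SimpleFifoCache(final int capacity) {
            // accessOrder = false keeps insertion order, so eviction is
            // strictly first-in, first-out
            this.map = new LinkedHashMap<K, V>(16, 0.75f, false) {
                protected boolean removeEldestEntry(Map.Entry<K, V> eldest) {
                    return size() > capacity; // drop the oldest entry when full
                }
            };
        }

        public V get(K key) { return map.get(key); }

        public void put(K key, V value) { map.put(key, value); }
    }

The removeEldestEntry hook keeps the sketch to a few lines; a hand-rolled ring buffer would do the same job with less per-entry overhead if the real class needs to be leaner.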

Devel:
* drop the link_analysis index; maintain an index of the pagelinks and
  redirect tables in the links index
* fix bug 11103: use FSUtils for all filesystem-related operations (see the
  sketch below)
* localization: read meta namespace names per dbname, pass iid to
  wikitokenizer, etc.
* RelatedBuilder can build the related mapping from both the links index
  and from a dump; cleanup and field compression
* new search method: related (returns the related mapping for an article)
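
As a rough illustration of the kind of helper the FSUtils bullet refers to, the sketch below centralizes two common filesystem operations using only java.io. The class and method names are hypothetical and are not claimed to match the actual org.wikimedia.lsearch.util.FSUtils API:

    // Illustrative sketch only; not the actual FSUtils API from this revision.
    import java.io.*;

    public final class FsSketch {
        private FsSketch() {}

        /** Recursively delete a file or an entire directory tree. */
        public static void deleteRecursive(File path) throws IOException {
            File[] children = path.listFiles(); // null for plain files
            if (children != null) {
                for (File child : children) {
                    deleteRecursive(child);
                }
            }
            if (path.exists() && !path.delete()) {
                throw new IOException("Cannot delete " + path);
            }
        }

        /** Copy a single file through a small fixed-size buffer. */
        public static void copyFile(File src, File dst) throws IOException {
            InputStream in = new FileInputStream(src);
            OutputStream out = new FileOutputStream(dst);
            try {
                byte[] buf = new byte[8192];
                int n;
                while ((n = >= 0) {
                    out.write(buf, 0, n);
                }
            } finally {
                in.close();
                out.close();
            }
        }
    }

Routing every copy and delete through one utility like this is what lets a bug such as 11103 be fixed in a single place rather than at every call site.
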
Modified paths:
  • /branches/lucene-search-2.1/lsearch-global.conf (modified)
  • /branches/lucene-search-2.1/src/org/apache/lucene (added)
  • /branches/lucene-search-2.1/src/org/apache/lucene/search (added)
  • /branches/lucene-search-2.1/src/org/apache/lucene/search/CustomBoostQuery.java (added)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Analyzers.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/ContextAnalyzer.java (added)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FieldBuilder.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FieldNameFactory.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FilterFactory.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/QueryLanguageAnalyzer.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/RelatedAnalyzer.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/SplitAnalyzer.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiTokenizer.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/ResultSet.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/Title.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/GlobalConfiguration.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/IndexId.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/HttpHandler.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/SearchDaemon.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/CleanupParser.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/HighlightDaemon.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/DumpImporter.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/Importer.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/IndexThread.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiSimilarity.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerClient.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/prefix/PrefixIndexBuilder.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/ContextParser.java (added)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/LinkReader.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Links.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/ObjectCache.java (added)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/RankBuilder.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/StringMap.java (added)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/TitleReader.java (deleted)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/related/CompactRelated.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/related/LinkReader.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/related/RelatedBuilder.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/related/RelatedTitle.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/related/TitleReader.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/RankDocValues.java (added)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/RankValueSource.java (added)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/RankValueSourceQuery.java (added)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/UpdateThread.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Warmup.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexWriter.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestTest.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/SpellCheckIndexer.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/LinkAnalysisStorage.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/RelatedStorage.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/EnglishAnalyzer.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SpellCheckTest.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/Command.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/FSUtils.java (added)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/Localization.java (modified)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/PHPParser.java (modified)
  • /branches/lucene-search-2.1/test-data/indexing-articles.test (modified)

Diff

Index: branches/lucene-search-2.1/lsearch-global.conf
@@ -18,19 +18,20 @@
 wikilucene : (nssplit,3) (nspart1,[0]) (nspart2,[4,5,12,13]), (nspart3,[])
 wikilucene : (language,en) (warmup,10)
 wikilucene : (spell,3,1) (prefix)
+enwiki,viwiki,srwiki,eswiki,dewiki,mlwiki,zhwiki,jawiki,itwiki,thwiki : (single)
 
 # Search groups
 # Index parts of a split index are always taken from the node's group
 # host : db1.part db2.part
 # Multiple hosts can search multiple dbs (N-N mapping)
 [Search-Group]
-oblak : wikilucene wikidev wikilucene.prefix
+oblak : wikilucene wikidev wikilucene.prefix wikilucene.related wikilucene.links
 
 # Index nodes
 # host: db1.part db2.part
 # Each db.part can be indexed by only one host
 [Index]
-oblak: wikilucene wikidev
+oblak: enwiki wikilucene wikidev viwiki srwiki eswiki dewiki mlwiki zhwiki jawiki itwiki thwiki
 
 # Rsync path where indexes are on hosts, after default value put
 # hosts where the location differs
Index: branches/lucene-search-2.1/test-data/indexing-articles.test
@@ -391,3 +391,419 @@
392392 [[tr:Maxwell denklemleri]]
393393 [[zh:麦克斯韦方程组]]
394394
 395+### namespace = 0
 396+### title = Douglas Adams
 397+### content
 398+
 399+==Education and early works==
 400+[[Image:Douglas Adams Sign from HH cover.jpg|200px|right|thumb|Douglas Adams was known to some fans as ''Bop Ad'' - after his illegible signature]]Adams first attended Primrose Hill Primary School in Brentwood. He took the exams and interviewed for [[Brentwood School (England)|Brentwood School]] at age six, and attended the [[Preparatory school (UK)|preparatory school]] from 1959 to 1964, then the main school until 1970. He was in the top stream, and specialised in the arts in the sixth form, after which he stayed an extra term in a special seventh form class, customary in the school for those preparing for [[Oxbridge]] entrance exams.
 401+
 402+While at the prep school, his English teacher, Frank Halford, reportedly
 403+awarded Adams the only ten out of ten of his entire teaching career for a
 404+creative writing exercise.<ref>http://www.bbc.co.uk/dna/h2g2/A3790659</ref>
 405+Adams remembered this for the rest of his life, especially when facing writer's
 406+block.<ref>{{cite book | author=Adams, Douglas | title=The Salmon of Doubt:
 407+Hitchhiking the Galaxy One Last Time | edition=US mass market paperback edition |
 408+publisher=Ballantine | year=2005 | pages=Page xix | id=ISBN 0-345-45529-0}}</ref>
 409+Some of Adams' earliest writing was published at the school, such as a report on
 410+the school's Photography Club in ''The Brentwoodian'' (in 1962) or spoof reviews
 411+in the school magazine ''Broadsheet'' (edited by [[Paul Neil Milne Johnstone]]).
 412+Adams also had a letter and short story published nationally in the UK in the boys'
 413+magazine ''The Eagle'' in 1965. He met [[Griff Rhys Jones]], who was in the year below,
 414+at the school, and was in the same class as "Stuckist" artist
 415+[[Charles Thomson (artist)|Charles Thomson]]; all three appeared together in
 416+a production of Shakespeare's Julius Caesar in 1968. He was six feet tall (1.83 m)
 417+by the time he was 12, and he stopped growing only at 6'5" (1.96 m).
 418+Later, he would often make self-ironic jokes about his own towering stature,
 419+"...the form-master wouldn't say 'Meet under the clock tower,' or
 420+'Meet under the War Memorial,' but 'Meet under Adams.'"
 421+<ref>{{cite book | author=Adams, Douglas | title=The Salmon of Doubt:
 422+Hitchhiking the Galaxy One Last Time | edition=First UK hardcover edition |
 423+publisher=Macmillan | year=2002 | pages=Page 7 | id=ISBN 0-333-76657-1}}</ref>
 424+
 425+### namespace = 0
 426+### title = Aaliyah
 427+### content
 428+
 429+{{Two other uses||Aaliyah's self-titled album|Aaliyah (album)||Aliyah (disambiguation)}}
 430+{{Infobox musical artist <!-- See Wikipedia:WikiProject Musicians -->
 431+| Name = Aaliyah
 432+| Img = Aaliyah5301.jpg<!--fair use image to be used only in this article-->
 433+| Img_capt = Promotional photo of Aaliyah from May 2001
 434+| Img_size = <!-- Only for images narrower than 220 pixels -->
 435+| Landscape =
 436+| Background = solo_singer
 437+| Birth_name = Aaliyah Dana Haughton<ref name="Aaliyah NNDB Profile">{{cite web| url =http://www.nndb.com/people/742/000024670/| title = Aaliyah NNDB Profile| publisher =NNDB| accessdate =2007-03-03}}</ref>
 438+| Alias = Baby Girl, The Princess of Hip-Hop Soul<br>Liyah<br>Wonder Woman<ref>[http://www.imdb.com/name/nm0004691/bio Aaliyah on IMDb]</ref>
 439+| Born = {{birth date|1979|1|16}}<br><small>[[Brooklyn]], [[New York]], [[United States|U.S.]]</small>
 440+| Died = {{death date and age|2001|8|25|1979|1|16}}<br><small>[[Abaco Islands]], [[The Bahamas]]</small>
 441+| Origin = {{Flagicon|USA}} [[Detroit, Michigan|Detroit]], [[Michigan]], [[United States|U.S.]]
 442+| Genre = [[R&B]], [[Hip hop soul]], [[Dance music|Dance]], [[Pop music|Pop]]
 443+| Occupation = [[Singer]], [[Model (person)|model]], [[dancer]], [[actress]]
 444+| Years_active = 1994 – 2001
 445+| Label = [[Blackground Records|Blackground]]
 446+| Associated_acts = [[Missy Elliott]], [[Timbaland]], [[Steve "Static" Garrett|Static]], [[R. Kelly]], [[Ginuwine]], [[Tweet (singer)|Tweet]]
 447+| URL = [http://www.aaliyah.com Aaliyah.com]
 448+}}
 449+'''Aaliyah Dana Haughton'''<ref name="Aaliyah NNDB Profile"/> ([[January 16]], [[1979]] – [[August 25]], [[2001]]), known professionally as '''Aaliyah''', was a [[Grammy Award]] winning [[United States|American]] [[singer]], [[dancer]], [[Model (person)|model]] and [[actress]]. Introduced to audiences by R&B singer [[R. Kelly]], Aaliyah became famous during the mid-1990s with several hit records from the songwriting/production team of [[Missy Elliott]] & [[Timbaland]] and their associate [[Steve "Static" Garrett]]. Aaliyah soon joined Timbaland's R&B and hip hop collective, the [[Superfriends Clique]].
 450+
 451+Notable for recording several hit records, including several number one R&B hits, a number one pop hit, and nine top 10 singles on the [[Billboard Hot 100]]. She also modeled for [[Tommy Hilfiger]] and starred in two [[motion pictures]] before dying in a plane crash in [[the Bahamas]].
 452+
 453+==Early years==
 454+Aaliyah Dana Haughton was born in Bedford Stuyvesant, Brooklyn, New York on January 16, 1979 to Michael and Diane Haughton, and was raised in Detroit, Michigan. Her name means "Highest, Most Exalted, The Best" in [[Arabic language|Arabic]] or "to ascend" in [[Hebrew]]. Aaliyah was brought up as a [[Catholic]] with her older brother [[Rashad Haughton]]. Her grandmother, Mintis L. Hicks Hankerson, was of [[African American]] and [[Native Americans in the United States|Native American]] descent. Diane Haughton, Aaliyah's mother, also a vocalist, encouraged her daughter's career. Her uncle, [[Barry Hankerson]], is a prominent individual in the music industry and Aaliyah's aunt, through marriage to Hankerson, is [[Gladys Knight]], a legendary soul singer with [[Gladys Knight & the Pips]].
 455+
 456+She appeared on the TV talent show ''Star Search'' at age ten, singing her mother's favorite song, "My Funny Valentine". Although she did not win, Aaliyah worked with an agent in New York and began to attend auditions for TV shows, including ''[[Family Matters (TV series)|Family Matters]]''.
 457+
 458+Following her appearance on ''Star Search'' Aaliyah performed on stage in [[Las Vegas]] with Gladys Knight. In her early teens, Aaliyah attended the Detroit High School for the Fine and Performing Arts, and graduated as a dance major with a 4.0 GPA
 459+
 460+==''Age Ain't Nothing But a Number'' (1994)==
 461+[[Image:Aaliyah-age-aint-94.jpg|right|200px|thumb|Cover of ''[[Age Ain't Nothing but a Number]]''.]]
 462+Aaliyah signed with her uncle [[Barry Hankerson]]'s [[Blackground Records]] label in 1993 at the age of 14. She released her debut album, titled ''[[Age Ain't Nothing but a Number]]'', in 1994 at the age of 15. [[R. Kelly]], Aaliyah's then alleged husband, was a leading songwriter and producer on her debut album. The album displayed her smooth and velvety vocals and the production work was said to be original and innovative. The album went [[platinum album|platinum]] within months. The album featured the gold-selling singles "[[Back and Forth (Aaliyah song)|Back and Forth]]" (#1 U.S. R&B, 3 weeks and #5 [[Hot 100]] ), "[[At Your Best]]" (#2 U.S. R&B and #6 [[Hot 100]] ), a cover of [[The Isley Brothers]]' 1976 song, the album-titled single "[[Age Ain't Nothing But A Number (song)|Age Ain't Nothing But A Number]]" (#75 [[Hot 100]] and #35 US R&B, 2 weeks), and "Down with the Clique" (#33 UK Top 75 Singles). "Back and Fourth" was sampled by [[Madonna (entertainer)|Madonna]] for the track, "[[Inside of Me]]" which appears on her 1994 album ''[[Bedtime Stories]]''. In June 1995, Aaliyah released another single to radio only, "No One Knows How to Love Me Quite Like You Do." The album has sold over 3.5 million copies in the U.S. to date and nearly 7 million worldwide.
 463+
 464+In 1994, a rumour surfaced that 15-year-old Aaliyah and 27-year-old R. Kelly had secretly married in the state of [[Illinois]]. Both initially denied. Although many websites and television shows claimed that they found a marriage certificate, it never has been truly proven that R. Kelly and Aaliyah have ever been married.
 465+
 466+==Guest appearances, movie roles and soundtracks (1995 - 2001)==
 467+[[Image:romeo must die dvd.jpg|150px|left|thumb|Aaliyah on the cover of her first film ''[[Romeo Must Die]] {{speedy-image-c}}'' alongside [[Jet Li]]]]
 468+In 1995 at age 16, Aaliyah performed "[[The Star-Spangled Banner]]" live at an [[Orlando Magic]] basketball game. Also during that year, she appeared on the soundtrack for ''[[A Low Down Dirty Shame]]'' with the minor international hit "The Thing I Like" (#33 UK). The song was also included on international versions of [[Age Ain't Nothing But A Number]].
 469+
 470+In 1997, Aaliyah appeared on the soundtrack album for the [[Fox Animation Studios]] animated feature ''[[Anastasia (1997 movie)|Anastasia]]'', singing the pop version of "[[Journey to the Past]]". The song was nominated for an [[Academy Awards|Academy Award]], and Aaliyah performed the song at the 1998 Academy Awards ceremony, becoming the youngest female recording artist to perform at the ceremony. Not only was Aaliyah the youngest female to perform but she was the youngest African American to have the nominee for [[Academy Award for Best Original Song|Best Original Song]].
 471+
 472+Aaliyah had a hit in 1998 with "[[Are You that Somebody]]" (number one airplay U.S. eight weeks), the main single from the ''[[Dr. Dolittle (film)|Dr. Dolittle]]'' soundtrack. Its video was the third most-played on [[MTV]] that year, and the song's success helped make Aaliyah a household name (and making her crowned as Queen of Urban Pop).
 473+
 474+In 2000, she co-starred with [[Jet Li]] in the [[martial arts]] film ''[[Romeo Must Die]]'', which debuted at number one at the box office. Aaliyah and Timbaland executive produced the film's soundtrack album and Aaliyah contributed four songs: "Are You Feelin' Me?," "I Don't Wanna," "Come Back in One Piece," a duet with [[DMX (rapper)|DMX]], and the international number one hit "[[Try Again]]." Aaliyah made history once more when "Try Again" became the first song to ever reach number one on the Billboard Hot 100 based solely on the strength of its radio airplay, without any single sales factored in. After the huge success of "Try Again" at radio, a [[12 inch single|12" maxi single]] was released for consumer purchase. The radio-only single, "I Don't Wanna", (which was also featured on the soundtracks for the films ''[[Next Friday]]'' and ''Romeo Must Die'') peaked at number five on the Billboard Hot R&B/Hip Hop Singles & Tracks chart.
 475+
 476+In 2001, Aaliyah went to [[Australia]] to co-star with [[Stuart Townsend]] in the film ''[[Queen of the Damned (film)|Queen of the Damned]]'', an adaptation of the [[Anne Rice]] novel of the same name. Aaliyah also recorded most of her third studio album, ''[[Aaliyah (album)|Aaliyah]]'', during this time.
 477+
 478+==''One in a Million'' (1996)==
 479+[[Image:aaliyah-one-in-a-million.jpg|200px|right|thumb|Cover of ''[[One in a Million (album)|One In A Million]]''.]]
 480+''[[One in a Million (album)|One In A Million]]'', Aaliyah's sophomore album, was chiefly written and produced by then unknowns [[Missy Elliott]] and [[Timbaland|Timothy "Timbaland" Mosley]] and released on [[August 27]], [[1996]] when she was 17 years old. The album was a landmark in Aaliyah's career, garnering her mass critical acclaim and introducing Aaliyah's more mature side. It embarked the newfound chemistry of Aaliyah and Timbaland. The album was certified double-platinum within a year, making Aaliyah a major R&B star and igniting the successful careers of Missy Elliott and Timbaland. ''One in a Million'' featured the international smash hit "[[If Your Girl Only Knew]]" (number one U.S. R&B, 2 weeks and #11 Hot 100), "[[One in a Million (Aaliyah song)|One In A Million]]," (#1 U.S. R&B airplay, six weeks & #25 US Hot 100 Airplay), the #8 U.S. R&B and #9 [[Hot 100]] single "[[The One I Gave My Heart To]]," a ballad written by [[Diane Warren]], "[[4 Page Letter]]" (#12 R&B Airplay), "[[Hot Like Fire (Timbaland Remix)|Hot Like Fire]]" (two versions) (#31 R&B Airplay), and "[[Got to Give It Up (Aaliyah song)|Got To Give It Up]](#37 UK)" (a remake of the [[1977]] [[Marvin Gaye]] song).
 481+
 482+[[Tommy Hilfiger]] gave Aaliyah her first endorsement deal. He signed Aaliyah onto print campaigns, runway shows, and a commercial. During this period, Aaliyah would also make guest appearances on albums by artists such as [[Missy Elliott]], [[Timbaland & Magoo]], [[Ginuwine]] and [[Playa (band)|Playa]]. [[Timbaland]] and [[Playa]]'s frontman [[Steve "Static" Garrett]] would remain Aaliyah's principal collaborators for the duration of her career. To date, ''One in a Million'' has sold over 3.7 million copies in the U.S. and over 11 million worldwide.
 483+
 484+After the success of ''One in a Million'', in 1997 Aaliyah headlined in her own tour "The Hot Like Fire Tour", in which she toured various major city venues performing hits from Her albums.
 485+
 486+==''Aaliyah'' (2001)==
 487+<!--[[Image:Aaliyah-ep-2001.jpg|200px|left|thumb|Cover of [[Aaliyah (album)|Aaliyah]] commenting out image with no source/bad FairUse claim-->
 488+"[[We Need a Resolution (Aaliyah song)|We Need a Resolution]]," the first single from Aaliyah's third studio album, was released [[April 24]], 2001 (see [[2001 in music]]). The self-titled ''[[Aaliyah (album)|Aaliyah]]'' was released three months later on [[July 17]], [[2001]]. The album was an instant critical success but sales were initially lower than expected, although they increased considerably after her death. ''Aaliyah'' introduced a darker and edgier side to Aaliyah's music and was noted as having showcased her growth as an artist. Around the time of the album's recording and release she had been filming ''Queen of the Damned'', which helped her show a dark and edgy side as her character was a deadly villain. The album debuted at number two on the [[Billboard 200]] chart, selling 190,000 copies in its first week, and was certified gold (500,000 copies sold) within four weeks, before her death. The week after the plane crash it climbed to number one. [[Trent Reznor]] of [[Nine Inch Nails]] was to produce a song on the album but scheduling conflicts did not permit the collaboration.
 489+
 490+==="More Than A Woman" and "Rock The Boat"===
 491+There was no shortage of confusion at the label regarding the next single from the Aaliyah album. Aaliyah had been promoting "[[More Than a Woman (Aaliyah song)|More Than a Woman]]", having performed it twice and shooting a video with director Dave Meyers in the summer of 2001. According to Blackground, a remix featuring State Property and Jay-Z was also planned, but was scrapped due to lack of adequate funds. The video was to be released but "[[Rock the Boat (Aaliyah song)|Rock the Boat]]" began receiving huge amounts of radio-play, so she was immediately sent to the Bahamas to shoot the video. The "[[Rock The Boat]]" music video was put in the 106 and Park hall of fame, making the countdown over 65 times and landed at #2 on [[BET]]'s Top 100 videos of 2001. "[[More Than a Woman]]" made the number-one spot after "Rock the Boat" was retired. "Rock the Boat" was #2 U.S. R&B and #14 Hot 100 single. "I Care 4 U" was #3 U.S. R&B and #16 Hot 100 single. The album went on to sell over 8 million copies worldwide.
 492+
 493+==Death==
 494+On August 25, 2001, at 6:49 pm, just after wrapping up filming of the "Rock the Boat" video, Aaliyah and various members of her record company boarded a twin engine [[Cessna 402]]B (N8097W) at Marsh Harbour, Abaco Island, Bahamas to travel to Opa-locka Airport near Miami, Florida, but the plane crashed shortly after takeoff about 200 feet from the runway. Pilot Luis Morales III and all eight passengers, including Aaliyah, were killed in the aviation incident. According to findings from an inquest conducted by the coroner's office in the Bahamas, Aaliyah suffered from "severe burns and a blow to the head," in addition to severe shock. The coroner theorized that, even if Aaliyah survived the crash, her recovery would have been virtually impossible given the severity of her injuries.<ref>[http://www.caribbeannetnews.com/2003/11/21/aaliyah.htm Bahamas Coroner delivers verdict in Aaliyah death crash.] (2003, November 21). ''Caribbean News''. Retrieved February 9, 2007.</ref>
 495+
 496+Aaliyah's eulogy was held on August 31 at Saint Ignatius Loyola Roman Catholic Church on East 84th Street in Manhattan. A horse-drawn carriage then carried her coffin to [[Ferncliff Cemetery]] in Hartsdale, New York, where she was initially interred in a crypt in the extension wing of the main mausoleum. When the Rosewood Mausoleum was completed a couple of years later, Aaliyah was moved to a private room in the new building. The inscription of her alias ''Baby Girl'' is engraved on her crypt.
 497+
 498+===Investigation===
 499+{{Copypaste}}
 500+
 501+NTSB reports indicate that the pilot, Luis Morales III, was not qualified to pilot the plane he was attempting to fly. Morales falsely obtained his FAA license by showing hundreds of hours never flown, and he may also have falsified how many hours he had flown in order to get a job with his employer, Blackhawk International Airways. Additionally, an autopsy performed on Morales revealed cocaine and alcohol in his blood.
 502+
 503+Further investigations determined the plane was over its total gross weight by several hundred pounds. Although witnesses claimed that the passengers had been asked to leave some luggage behind, it was later discovered that the passengers, including Aaliyah, had not been informed of the excess weight.
 504+
 505+Eddie Golson, president of Pro Freight Cargo Services at Opa-locka Airport, said workers carted "a pickup truck of freight" from the crash site Monday. "That's absurd to think that this pilot got in this airplane with eight other people and a truck full of freight and expected this thing to fly," Golson said. "What the hell was going on?" a baggage handler was reported to have said, in reaction to hearing that no one weighed the passengers or baggage. Two of the passengers, members of Aaliyah's entourage, weighed in the neighborhood of 300 pounds and sat in the rear of the plane, where the baggage was also stored.
 506+
 507+The day of the aviation incident was Mr. Morales' first official day with Blackhawk International Airways, a Part 135 single-pilot operation. He had been employed with Golden Airlines, from which he was fired only four hours before the fatal aviation incident. In addition, Luis Morales III was not registered with the FAA to fly for Blackhawk. As a result of the incident, a wrongful death lawsuit was filed by Aaliyah’s parents and was later settled out of court for an undisclosed amount.
 508+
 509+Barry & Sons, Inc., a corporation formed in 1992 to develop, promote and capitalize on the musical talents of Aaliyah and to oversee the production and distribution of her records, tapes and music videos, brought an unsuccessful lawsuit in the Supreme Court of the State of New York against Instinct Productions LLC, (a company hired by Barry & Sons, Inc. in August, 2001 to produce the "Rock the Boat" music video). The case was dismissed since New York State's wrongful death statute only permits certain people to recover damages for wrongful death.<ref>[http://www.courts.state.ny.us/reporter/3dseries/2005/2005_00096.htm Text of appellate division decision dismissing the case.]</ref>
 510+
 511+==Legacy==
 512+"Rock the Boat" went on to become a [[Posthumous work|posthumous]] hit on radio (reaching number two on Billboard's Hot R&B Singles charts, number 14 on the Hot 100, and number 12 in the UK) and video channels, and the news of Aaliyah's death gave her album a notable sales boost, pushing it to number one on the Billboard 200. The album produced two other singles. "More than A Woman" reached number 7 on Billboard's Hot R&B singles chart, number 25 on Hot 100, and number one in the UK. "I Care 4 U" reached number three on Billboard's Hot R&B singles chart and number 16 on the Hot 100, the latter attaining success even without the promotional push of a [[music video]]. The ''Aaliyah'' album went on to sell over 3 million copies in the U.S. ''[[Queen of the Damned (film)|Queen of the Damned]]'' was released in early 2002. Before its release, Aaliyah's brother Rashad was called upon to re-dub several of his sister's lines during the post-production [[Dubbing (music)|ADR]] process. Upon its release, the film debuted at number one. The film was also dedicated to her.
 513+
 514+In 2001, Missy Elliott released her video for "[[Miss E... So Addictive|Take Away]]". The video contained words and images about Aaliyah. The single also featured Ginuwine and was the debut of Elliott's recent protégé, [[Tweet (singer)|Tweet]].
 515+
 516+Aaliyah and former [[The Beatles|Beatle]] [[George Harrison]] made UK Chart History in January 2002 when they scored the first, and to this date only, back-to-back posthumous number one hits (aside from the [[Elvis Presley]] re-releases in 2005). Aaliyah's "More than a Woman", released on January 7 and topped the chart on January 13, was followed by Harrison's "My Sweet Lord", re-released on January 14 and topped the chart on January 20. Aaliyah was voted one of "The Top 40 Woman of the Video Era" in VH1's The Greatest, also ranked #36 on their list of the 100 Sexiest Artist. Aaliyah also made E!'s list on the 101 Most Shocking Moments in Entertainment, Juciest Hollywood Hookups, and Best Kept Hollywood Secrets. Aaliyah recently ranked at #18 on BET's "Top 25 Dancers of All Time". In 2005, former Co-Star Jet-Li as reported from CNN stated the memory of Aaliyah had haunted him in Vancouver, where he and the late songstress shot the film [[Romeo Must Die]].
 517+
 518+In addition Aaliyah has been the topic of five books: ''Aaliyah: More than a Woman'' (2001) by Christopher John Farley, ''Aaliyah: An R&B Princess in Words and Pictures'' (2002) by Kelly Kenyatta, ''Aaliyah'' by [[Tim Footman]] (2003), and ''Aaliyah Remembered: Her Life & The Person behind the Mystique'' (2005) by William Sutherland.
 519+
 520+"Her legacy is, you can achieve your dreams and still maintain being a beautiful person"
 521+-Brother [[Rashad Haughton]]
 522+
 523+===Lyrical Dedications===
 524+
 525+* [[Boyz II Men]]: "Think Of Aaliyah" a.k.a. "The Aaliyah Song" - "''When you think of Aaliyah, laugh, don't cry, cause I know she want it that way''".
 526+* [[Brandy (entertainer)|Brandy]]: "Turn It Up" - ''"Get Baby Girl's attention, she's more than a woman and we sure do miss her. I wanna represent her, Timbaland, Missy, you get the picture."'',"Should I Go" - "''this industry was more like a different world, when it was just me, Monica, and Baby Girl [Aaliyah], I never got to tell you how much you meant / I wish you and me both was sittin' here workin with Tim / Just to be in the presence of people that you affected on a personal level just makes me stop for a second. [inhales and exhales] You were such a blessing, you helped me answer all of my questions."''
 527+* [[D12]]: "9-11" - ''"We lost Aaliyah, lost our families, it takes no tenges. You don't need us to see the world is (messed) up, God can see it"''
 528+* [[Foxy Brown]]: "Big Bad Mama" - ''"Rhyme deep in footwear, via Spiga/ Like Aaliyah, One in a Million/There's MC's in this rap shit comin in illin/ like I did, laid the groundwork for five hits/ Member when I told y'all first week out/ Shipped a half a mil, niggaz freaked out/ Love yourself, put no one above thee/ Cause ain't nobody gon' fuck me like me, it's on'"''
 529+
 530+* [[Jadakiss]]: "Why" - ''"Why Aaliyah had to take that flight?"''
 531+* [[Jay-Z]]: "Miss You Remix" - names certain people who missed her after her death.
 532+* [[Juelz Santana]]: "One Day I Smile" - ''"Once again a deep thought of Aaliyah crosses my mind"''
 533+* [[Kanye West]]: "Never Let Me Down" - ''"But I can't complain what the accident did to my Left Eye / Cuz look what an accident did to Left Eye / First Aaliyah and now Romeo must die / I know I got angels watching me from the other side"''
 534+* [[Layzie Bone]]: "For The Thugs That's Gone" - " Too many celebrities perish, these people we love and cherish, and I had a chance to meet Aaliyah, but I was to embarrassed and I should of took a chance, I heard that from a man, Jam Master J was so real, you niggas don't understand, he told me to handle my business, make sure I pay my taxes, a little advice from a legend to keep my paper stacking, and I gotta give props to Eazy, that nigga put me on, if he didn't believe in the thugs you all wouldn't of heard of Bone.
 535+* [[Lil' Flip]]: "Hall of Fame Graveyard" - ''"From Eazy E to Aaliyah, we even lost Left Eye / How come the wack rappers live but the best die"''
 536+* [[Mary J. Blige]] - "[[MJB Da MVP]]" - ''"It was when Aaliyah died / I could hardly sleep / Thought about it every day / and it made Me change my ways"''
 537+* [[Missy Elliott]]: "Can You Hear Me?" - ''"I been checkin' on your moms and dad / And your brother since the day you left / Passed on and went away with God / But for your mama it's been / So damn hard / I hate to even hear her cry / Aaliyah she asked me why / Would her baby girl go this way / Can you give me better words to say / Cause One day she'll see you again / With the same old beautiful smile / Long hair and the voice of a hummingbird / You'll be singing them same old songs / Aaliyah can you hear me? / I hope that you're proud of me / Me and Tim we been doing our thing / But it's never been the same / Since you had to go / I ain't never met a friend / More incredible"''
 538+* [[Monica (singer)|Monica]]: "Go To Bed Mad" - ''"Argue about things so critical / And you heated over nothing / And just hang up the phone / I want / I wanna talk in the mood / See we need a resolution / Like that Aaliyah song"''
 539+* [[Mya]]: "After The Rain" - ''"No one could ever fill your shoes, you're one in a million"''
 540+* [[Nas]] featuring [[Quan (rapper)|Quan]]: "Just A Moment" - ''"And can we please have a moment to mourn? / For Pac, Biggie and Pun 'cause through us they live on / Jam Master Jay, Freaky Ty and Aaliyah / Big L and Left Eye, when we die we hope to see ya"''
 541+* [[Ray J]]: "War Is Over" - ''"One day one day one day / I hope to see my girl Aaliyah"''
 542+* [[The Game (rapper)|The Game]]: "Dreams" - ''"Martin Luther King had a dream, Aaliyah had a dream, Left Eye had a dream"'', "Runnin" - ''"God let me in, give me a room by Aaliyah with ESPN"''
 543+* [[T.I.]]: "Rubberband Man" - ''"throw your lighters up for my cousin Toot, Aaliyah, Left Eye, and Jam Master Jay"''
 544+* [[TQ]] : "Gone But Not Forgotten" - ''"Aaliyah, I wish we could've did a song, but baby girl when I get my wings, I'm gonna send your precious love"''
 545+* [[Wyclef Jean]]: "Industry" - ''"Back and forth and forth and back / Like Miss Aaliyah man do I miss her"''
 546+* [[Outkast]] ft. [[Killer Mike]]: "The Whole World" - ''"Mami, I'm coming, I hope u get off / Or rock your own boat like Aaliyah don taught / Back, back and forth, forth / Get that sailor on course course"''
 547+* French R'n'b singer [[Assia]] covered "Don't know what to tell ya" with French and Arabic lyrics and entitled it "Le prix pour t' aimer (Habibi Maareft Ach'n Oullek)" in her latest album "Encore et Encore".
 548+The Gossip - (covered are you that somebody) as a tribute to Aaliyah
 549+
 550+Others include tracks by ''[[DMX]], [[Yolanda Adams]], [[Tyrese]], [[R. Kelly]], [[TLC (band)|TLC]], [[Timbaland]] & Outsiderz 4 Life''.{{Fact|date=March 2007}}
 551+
 552+* [[Cooper C.]]: "Why...?" - "Imma rock da boat, Aaliyah, and be wit you. One day, hopefully I will see you too."
 553+
 554+===Unfinished Films===
 555+Aaliyah was to have had a supporting role as Zee, the wife of [[Harold Perrineau Jr.]]'s character, Link, in the two sequels to ''[[The Matrix]]''. The directors initially tried to find a way to incorporate her footage into the movies but decided against it due to lack of material available. The role was recast with [[Nona Gaye]] playing the character. Other films in which Aaliyah was signed to star in were ''[[Honey (2003 film)|Honey]]'' (which instead was filmed with [[Jessica Alba]] as the main character), and a [[Whitney Houston]]-produced remake of the 1976 film ''[[Sparkle (1976 film)|Sparkle]]'' (now in the works with [[Raven-Symoné]] as the main character).
 556+
 557+In addition, Aaliyah and one of her agents had pitched and signed a deal with Fox Searchlight Pictures for her to star in a film based upon a true story about interracial love.
 558+
 559+==''I Care 4 U'' (2002)==
 560+[[Image:aaliyah icare4u.jpg|right|thumb|200 px|Cover of ''[[I Care 4 U]]''.]]
 561+''I Care 4 U'' was released by Blackground Records on December 10, 2002. Along with her hit singles, a number of previously unreleased tracks were included on the album, including "[[Erica Kane]]", "Don't Worry" and "All I Need" and the new singles "Miss You", "Don't Know What to Tell Ya", and "Come Over." ''I Care 4 U'' debuted at an impressive #3 on the Billboard 200 and #1 on the R&B album charts (where it remained for 7 weeks). The album went on to sell over 2.6 million in the U.S. and 5 million worldwide.
 562+
 563+The video for "Miss You" features [[Missy Elliott]], [[Toni Braxton]], [[Lil' Kim]], [[Dallas Austin]], MTV presenter and close friend [[Ananda Lewis]], actor/singer [[Jamie Foxx]], [[AJ Calloway]], [[Free (rapper)|Free]], [[Quddus (MTV)|Quddus]], Missy's recent protegé and longtime friend [[Tweet]], [[U-God]] (of the ''[[Wu-Tang Clan]]'') and [[DMX (rapper)|DMX]], [[Rosario Dawson]], among others, paying tribute to Aaliyah. Following her death, her single "[[Miss You (2003 song)|Miss You]]" made it to #1 on the [[Billboard Hot 100|US R&B Charts]]. The album earned Aaliyah a [[posthumous]] [[Grammy]] for [[Best Instrumental Arrangment Accompanying Vocals]]
 564+
 565+The follow-up single to "Miss You" was "[[Don't Know What to Tell Ya]]". However it was only released in Europe and peaked at #22 in the UK and #57 in Germany. The "Handcuff Remix" became popular among fans who had bought the single. The third and final Single released (second in the U.S.) was [[Come Over (Aaliyah song)|Come Over]]. The single had moderate pop success peaking in the top 40 of The Hot 100 at #32. It did a lot better on the R&B charts becoming a top 10 hit peaking at #9.
 566+
 567+Shortly after, the "Greatest Hits : Special Fan Box" [http://www.amazon.co.uk/exec/obidos/ASIN/B0001GYH2A/ref=ord_cart_shr/202-2194674-4915813] was released. It featured re-packaged versions of the albums "One In A Million", "Aaliyah" and "I Care 4 U". It also featured a DVD containing all of Aaliyah's music videos. It was all packaged in a special box.
 568+
 569+==Aaliyah in the mid-2000s==
 570+In early/mid-2005, four previously unreleased Aaliyah tracks were leaked to the Internet: a cover of [[Gladys Knight & the Pips|Gladys Knight & the Pips']] "Giving Up", "Where Could He Be" featuring Missy Elliot and Tweet (which was sent to radio stations), "Steady Ground" featuring Static from Playa, and a duet with Digital Black from Playa entitled "Don't Think They Know". In January 2006, a new unreleased Aaliyah track was leaked to the Internet. Entitled "Time", it was a snippet of an unfinished song and was produced by Timbaland (Sample of this track can be found on YouTube) Buzz of a song titled "Girlfriends" has been brewing for years now since the death of Aaliyah, until recently [[Yaushameen Michael]] posted the song on her Myspace, a Duet with the late R&B Princess. There are also many other rumored unreleased tracks such as "Did You Hear", "Dont Think They Know" feat. Digital Black, "Forever in My Heart", and "Candy".
 571+
 572+==Merchandise and the Aaliyah Charity Fund==
 573+Aaliyah's official website features items such as t-shirts with Aaliyah's name on them. She has had a calendar with her pictures since 2002. In 2007, Aaliyah's mother Diane Haughton and former manager Paul Allcata hired branding and licensing agency Wicked Cow Entertainment to grow the Aaliyah licensing program. Plans are currently underway for an apparel and accessories line.<ref>{{cite|The Licensing Letter|epmcom.com|title=Properties Available for Licensing|publisher=EPM|author=The Licensing Letter|date=2007-04-12}}</ref>
 574+
 575+==Discography==
 576+{{further|[[Aaliyah discography]]}}
 577+
 578+===Albums===
 579+{| class="wikitable"
 580+! width=100| Year
 581+! width=200| Album Title
 582+! width=100| U.S.
 583+! width=100| Worldwide
 584+! width=100| U.S. Charts
 585+|- align="center"
 586+| 1994 || ''[[Age Ain't Nothing but a Number]]'' || 3.5 million || 7 million || 18
 587+|- align="center"
 588+| 1996 || ''[[One in a Million (album)|One in a Million]]'' || 3.7 million || 11 million {{fact}} || 18
 589+|- align="center"
 590+| 2001 || ''[[Aaliyah (album)|Aaliyah]]'' || 2.7 million || 8 million || 1
 591+|- align="center"
 592+| 2002 || ''[[I Care 4 U]]'' || 1.5 million || 6 million || 3
 593+|- align="center"
 594+| 2005 || ''[[Ultimate Aaliyah]]'' || 0.2 million || 2 million{{fact}} || -
 595+|-
 596+! colspan=2 | Total || 11.6 million || 34 million ||
 597+|}
 598+
 599+=== Number-one singles ===
 600+<!-- If it doesn't enter the singles chart, airplay charts are allowed to be putted. -->
 601+
 602+{{dablink|The following singles reached number one in the [[United States|U.S.]], the [[Hot R&B/Hip-Hop Songs|U.S. R&B]], the [[United Kingdom]] and the [[New Zealand]]. It also includes its peak in the [[United World Chart]]}}
 603+{| class="wikitable"
 604+|- bgcolor="#CCCCCC"
 605+!align="center" rowspan="2" | Year
 606+!align="center" rowspan="2" | Single
 607+!align="center" colspan="8" | Peak positions
 608+|- bgcolor="#FFFFFF"
 609+! width="60"|<small>US</small>
 610+! width="60"|<small>US R&B</small>
 611+! width="60"|<small>UK</small>
 612+! width="60"|<small>NZ</small>
 613+! width="60"|<small>United World Chart</small>
 614+|-
 615+|align="center" rowspan="1"|1994
 616+|align="left"|"[[Back and Forth (song)|Back and Forth]]"
 617+|align="center"|5
 618+|align="center"|'''1'''
 619+|align="center"|16
 620+|align="center"|18
 621+|align="center"|-
 622+|-
 623+|align="center" rowspan="2"|1996
 624+|align="left"|"[[If Your Girl Only Knew]]"
 625+|align="center"|11
 626+|align="center"|'''1'''
 627+|align="center"|15
 628+|align="center"|-
 629+|align="center"|-
 630+|-
 631+|align="left"|"[[One in a Million (Aaliyah song)|One in a Million]]"
 632+|align="center"|25
 633+|align="center"|'''1'''<sup>1<sup>
 634+|align="center"|15
 635+|align="center"|-
 636+|align="center"|-
 637+|-
 638+|align="center" rowspan="1"|1998
 639+|align="left"|"[[Are You That Somebody?]]"
 640+|align="center"|10
 641+|align="center"|'''1'''<sup>1<sup>
 642+|align="center"|11
 643+|align="center"|'''1'''
 644+|align="center"|-
 645+|-
 646+|align="center" rowspan="1"|2000
 647+|align="left"|"[[Try Again]]"
 648+|align="center"|'''1'''
 649+|align="center"|4
 650+|align="center"|5
 651+|align="center"|13
 652+|align="center"|4
 653+|-
 654+|align="center" rowspan="1"|2002
 655+|align="left"|"[[More Than a Woman (Aaliyah song)|More Than a Woman]]"
 656+|align="center"|25
 657+|align="center"|7
 658+|align="center"|'''1'''
 659+|align="center"|-
 660+|align="center"|37
 661+|-
 662+|align="center" rowspan="1"|2003
 663+|align="left"|"[[Miss You (Aaliyah song)|Miss You]]"
 664+|align="center"|3
 665+|align="center"|'''1'''
 666+|align="center"|7
 667+|align="center"|-
 668+|align="center"|29
 669+|-
 670+|align="center"|
 671+!align="center"|Total number-one singles
 672+|align="center"|'''1'''
 673+|align="center"|'''5'''
 674+|align="center"|'''1'''
 675+|align="center"|'''1'''
 676+|align="center"|-
 677+|}
 678+
 679+*Notes:
 680+<sup>1</sup> *Topped in the [[Hot R&B/Hip-Hop Airplay]]
 681+
 682+==Awards==
 683+This is a list of awards for which Aaliyah was nominated during her career.
 684+
 685+===1995===
 686+*1995 Nominated for an American Music Award for Favorite Soul/R&B New Artist
 687+
 688+*1995 Best R&B Female Vocal Performance for ''At Your Best'': Nominated
 689+
 690+* Nominated for Two MTV VMA's: ''Best New Artist in Video'' and '' Best R&B Video '' both for At Your Best
 691+
 692+* Nominated for three World Music Award: '' Worlds Best Selling Female Artist'', Worlds Best Selling New Artist'' and'' Worlds Best Selling R&B Artist''
 693+
 694+* Nominated for eight Billboard Music Awards: Best New R&B Artist, Best New Artist, Female Artist of the Year, Best R&B/Hip Hop single, Best Female R&B Single, Best Female Pop Single, Best Hip Hip/R&B Artist, Best R&B/Hip Hop Album
 695+
 696+* Source Awards: Best New Artist and Best Female R&B Artist '''WON'''
 697+
 698+===1999===
 699+* 1999 Nominated for two MTV Video Music Awards: R&B Video ("Are You That Somebody?"), Best Video from a Film ("Are You That Somebody")
 700+
 701+* 1999 Nominated for an American Music Award for Favorite Soul/R&B Female Artist.
 702+
 703+* 1999 Nominated for an NAACP Image Award for Outstanding Music Video ("Are You That Somebody?")
 704+
 705+* 1999 Nominated for two Soul Train Lady of Soul Awards for Best R&B/Soul Song and Best R&B/Soul or Rap Music Video ("Are You That Somebody?").
 706+
 707+*1999 Nominated for a Grammy Award for Best Female R&B Vocal Performance ("Are You That Someobdy?")
 708+
 709+*1999 Nominated for a [[Academy Award]] Best Original Song for: Journey To The Past
 710+
 711+===2000===
 712+*2000 Nominated for two Soul Train Lady of Soul Awards for Best RnB / Soul Single - Solo and Best RnB / Soul or Rap Music Video ("Try Again")
 713+* 2000 '''Won''' two MTV Video Music Awards for Best Female Video, Best Video from a Film ("Try Again")
 714+* 2000 Nominated for MTV Europe Music Video Award for Best RnB video
 715+*2000 Nominated for Radio Music Award Urban song of the year and Urban artist of the year.
 716+*2000 Nominated My VH1 music award nominee for double threat (Musicians-Actors) award.
 717+
 718+===2001===
 719+*2001 Nominated for a Grammy Award for Best Female R&B Vocal Performance ("Try Again")
 720+
 721+===2002===
 722+*2002 '''Won''' two American Music Awards: Favorite Soul/R&B Female Artist and Favorite Soul/R&B Album.
 723+*2002 Nominated for two Grammy Awards for Best Female R&B Vocal Performance ("Rock The Boat") and Best R&B Album ("Aaliyah")
 724+*2002 '''Won''' a Soul Train Award for R&B/Soul Single; Female ("Rock The Boat")
 725+*2002 '''Won''' the Best R&B / Soul Single, Solo Award and R&B/Soul or Rap Song of the Year at the Soul Train Lady of Soul Awards (for "Rock The Boat")
 726+*2002 Nominated for an MTV Video Music Award for Best R&B Video ("Rock The Boat")
 727+===2003===
 728+*2003 '''Won''' Source Awards: Best Female R&B Artist ''I Care 4 U''
 729+
 730+==Filmography==
 731+*''[[Romeo Must Die]]'' (2000) - Trish O'Day
 732+*''[[Queen of the Damned (film)|Queen of the Damned]]'' (2002) - Akasha
 733+
 734+===Unfinished films===
 735+
 736+*''[[The Matrix Reloaded]]'' (2003) (replaced by [[Nona Gaye]])
 737+*''[[The Matrix Revolutions]]'' (2003) (replaced by [[Nona Gaye]])
 738+*''[[Honey (2003 film)|Honey]]'' (2003) (replaced by [[Jessica Alba]])
 739+*''[[Sparkle]]'' (replaced by [[Raven-Symoné]])
 740+
 741+Aaliyah was [http://www.notstarring.com/actors/aaliyah considered to play a part in other films], including:
 742+
 743+*''[[Charlie's Angels (film)|Charlie's Angels]]'' (2000)
 744+*''[[Get Over It (film)|Get Over It]]'' (2001)
 745+*''[[Josie and the Pussycats (film)|Josie and the Pussycats]]'' (2001)
 746+*''[[Osmosis Jones]]'' (2001)
 747+
 748+==See also==
 749+*[[Blackground Records]]
 750+*[[Missy Elliott]]
 751+*[[Steve "Static" Garrett]]
 752+*[[Rashad Haughton]]
 753+*[[Swing Mob]]
 754+*[[Timbaland]]
 755+
 756+==References==
 757+{{reflist|2}}
 758+
 759+==Links==
 760+*[http://www.Aaliyah.com Official site]
 761+*[http://www.aaliyahicare4u.com Artist Website]
 762+*[http://www.myspace.com/aaliyah Aaliyah's Official Myspace Page]
 763+*{{imdb name|id=0004691|name=Aaliyah}}
 764+*{{nndb name|id=742/000024670|name=Aaliyah}}
 765+*[http://www.billboard.com/bbcom/bio/index.jsp?pid=36610&cr=artist&or=ASCENDING&sf=length&kw=aaliyah Aaliyah bio on Billboard]
 766+*[http://www.hibblenradio.com/2001-08-CBS-AaliyahCrash.mp3 MP3 of CBS News, Radio reports on crash from Abaco Island]
 767+
 768+<!-- PLEASE DO NOT ADD FANSITES -->
 769+
 770+{{Aaliyah}}
 771+
 772+[[Category:Aaliyah| ]]
 773+[[Category:American actor-singers]]
 774+[[Category:American pop singers]]
 775+[[Category:American dance musicians]]
 776+[[Category:American Roman Catholics]]
 777+[[Category:American rhythm and blues singers]]
 778+[[Category:American female singers]]
 779+[[Category:American film actors]]
 780+[[Category:American dancers]]
 781+[[Category:African-American singers]]
 782+[[Category:African-American actors]]
 783+[[Category:Michigan musicians]]
 784+[[Category:Swing Mob artists]]
 785+[[Category:People from Brooklyn]]
 786+[[Category:People from Detroit]]
 787+[[Category:Plane crash victims]]
 788+[[Category:1979 births]]
 789+[[Category:2001 deaths]]
 790+
 791+[[cs:Aaliyah]]
 792+[[de:Aaliyah]]
 793+[[et:Aaliyah]]
 794+[[es:Aaliyah]]
 795+[[fa:عالیه]]
 796+[[fr:Aaliyah]]
 797+[[gl:Aaliyah]]
 798+[[it:Aaliyah]]
 799+[[lt:Aaliyah]]
 800+[[ms:Aaliyah]]
 801+[[nl:Aaliyah]]
 802+[[ja:アリーヤ]]
 803+[[no:Aaliyah]]
 804+[[pl:Aaliyah]]
 805+[[pt:Aaliyah]]
 806+[[ru:Аалия]]
 807+[[simple:Aaliyah]]
 808+[[fi:Aaliyah]]
 809+[[sv:Aaliyah]]
 810+[[tl:Aaliyah]]
Index: branches/lucene-search-2.1/src/org/apache/lucene/search/CustomBoostQuery.java
@@ -0,0 +1,351 @@
 2+package org.apache.lucene.search;
 3+
 4+import org.apache.lucene.search.*;
 5+
 6+/**
 7+ * Licensed to the Apache Software Foundation (ASF) under one or more
 8+ * contributor license agreements. See the NOTICE file distributed with
 9+ * this work for additional information regarding copyright ownership.
 10+ * The ASF licenses this file to You under the Apache License, Version 2.0
 11+ * (the "License"); you may not use this file except in compliance with
 12+ * the License. You may obtain a copy of the License at
 13+ *
 14+ * http://www.apache.org/licenses/LICENSE-2.0
 15+ *
 16+ * Unless required by applicable law or agreed to in writing, software
 17+ * distributed under the License is distributed on an "AS IS" BASIS,
 18+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 19+ * See the License for the specific language governing permissions and
 20+ * limitations under the License.
 21+ */
 22+
 23+import java.io.IOException;
 24+import java.util.Set;
 25+
 26+import org.apache.lucene.index.IndexReader;
 27+import org.apache.lucene.search.ComplexExplanation;
 28+import org.apache.lucene.search.Explanation;
 29+import org.apache.lucene.search.Query;
 30+import org.apache.lucene.search.Scorer;
 31+import org.apache.lucene.search.Searcher;
 32+import org.apache.lucene.search.Similarity;
 33+import org.apache.lucene.search.Weight;
 34+import org.apache.lucene.util.ToStringUtils;
 35+
 36+/**
 37+ * Query that sets document score as a programmatic function of (up to) two (sub) scores.
 38+ * <ol>
 39+ * <li>the score of its subQuery (any query)</li>
 40+ * <li>(optional) the score of its boosting Query,
 41+ * for most simple/convineient use case this query would be a
 42+ * {@link org.apache.lucene.search.function.FieldScoreQuery FieldScoreQuery}</li>
 43+ * </ol>
 44+ * Subclasses can modify the computation by overriding {@link #customScore(int, float, float)}.
 45+ *
 46+ * Note: documents will only match based on the first sub scorer.
 47+ *
 48+ * <p><font color="#FF0000">
 49+ * WARNING: The status of the <b>search.function</b> package is experimental.
 50+ * The APIs introduced here might change in the future and will not be
 51+ * supported anymore in such a case.</font>
 52+ */
 53+public class CustomBoostQuery extends Query {
 54+
 55+ private Query subQuery;
 56+ private Query boostQuery; // optional, can be null
 57+ private boolean strict = false; // if true, boosting part of query does not take part in weights normalization.
 58+
 59+ /**
 60+ * Create a CustomBoostQuery over input subQuery.
 61+ * @param subQuery the sub query whose scored is being customed. Must not be null.
 62+ */
 63+ public CustomBoostQuery(Query subQuery) {
 64+ this(subQuery,null);
 65+ }
 66+
 67+ /**
 68+ * Create a CustomBoostQuery over input subQuery and a {@link Query}.
 69+ * @param subQuery the sub query whose score is being customed. Must not be null.
 70+ * @param boostQuery a value source query whose scores are used in the custom score
 71+ * computation. For most simple/convineient use case this would be a
 72+ * {@link org.apache.lucene.search.function.FieldScoreQuery FieldScoreQuery}.
 73+ * This parameter is optional - it can be null.
 74+ */
 75+ public CustomBoostQuery(Query subQuery, Query boostQuery) {
 76+ super();
 77+ this.subQuery = subQuery;
 78+ this.boostQuery = boostQuery;
 79+ if (subQuery == null) throw new IllegalArgumentException("<subqyery> must not be null!");
 80+ }
 81+
 82+ /*(non-Javadoc) @see org.apache.lucene.search.Query#rewrite(org.apache.lucene.index.IndexReader) */
 83+ public Query rewrite(IndexReader reader) throws IOException {
 84+ subQuery = subQuery.rewrite(reader);
 85+ if (boostQuery!=null) {
 86+ boostQuery = (Query) boostQuery.rewrite(reader);
 87+ }
 88+ return this;
 89+ }
 90+
 91+ /*(non-Javadoc) @see org.apache.lucene.search.Query#extractTerms(java.util.Set) */
 92+ public void extractTerms(Set terms) {
 93+ subQuery.extractTerms(terms);
 94+ if (boostQuery!=null) {
 95+ boostQuery.extractTerms(terms);
 96+ }
 97+ }
 98+
 99+ /*(non-Javadoc) @see org.apache.lucene.search.Query#clone() */
 100+ public Object clone() {
 101+ CustomBoostQuery clone = (CustomBoostQuery)super.clone();
 102+ clone.subQuery = (Query) subQuery.clone();
 103+ if (boostQuery!=null) {
 104+ clone.boostQuery = (Query) boostQuery.clone();
 105+ }
 106+ return clone;
 107+ }
 108+
 109+ /* (non-Javadoc) @see org.apache.lucene.search.Query#toString(java.lang.String) */
 110+ public String toString(String field) {
 111+ StringBuffer sb = new StringBuffer(name()).append("(");
 112+ sb.append(subQuery.toString(field));
 113+ if (boostQuery!=null) {
 114+ sb.append(", ").append(boostQuery.toString(field));
 115+ }
 116+ sb.append(")");
 117+ sb.append(strict?" STRICT" : "");
 118+ return sb.toString() + ToStringUtils.boost(getBoost());
 119+ }
 120+
 121+ /** Returns true if <code>o</code> is equal to this. */
 122+ public boolean equals(Object o) {
 123+ if (getClass() != o.getClass()) {
 124+ return false;
 125+ }
 126+ CustomBoostQuery other = (CustomBoostQuery)o;
 127+ return this.getBoost() == other.getBoost()
 128+ && this.subQuery.equals(other.subQuery)
 129+ && (this.boostQuery==null ? other.boostQuery==null
 130+ : this.boostQuery.equals(other.boostQuery));
 131+ }
 132+
 133+ /** Returns a hash code value for this object. */
 134+ public int hashCode() {
 135+ int boostHash = boostQuery==null ? 0 : boostQuery.hashCode();
 136+ return (getClass().hashCode() + subQuery.hashCode() + boostHash) ^ Float.floatToIntBits(getBoost());
 137+ }
 138+
 139+ /**
 140+ * Compute a custom score from the subQuery score and the boost query score.
 141+ * <p>
 142+ * Subclasses can override this method to modify the custom score.
 143+ * <p>
 144+ * The default computation herein is:
 145+ * <pre>
 146+ * ModifiedScore = (0.2 + boostScore * 10) * subQueryScore.
 147+ * </pre>
 148+ *
 149+ * @param doc id of the scored doc.
 150+ * @param subQueryScore score of that doc by the subQuery.
 151+ * @param boostScore score of that doc by the boost query.
 152+ * @return custom score.
 153+ */
 154+ public float customScore(int doc, float subQueryScore, float boostScore) {
 155+ return (0.2f + boostScore * 10) * subQueryScore;
 156+ }
 157+
 158+ /**
 159+ * Explain the custom score.
 160+ * Whenever overriding {@link #customScore(int, float, float)},
 161+ * this method should also be overridden to provide the correct explanation
 162+ * for the custom scoring part.
 163+ * @param doc doc being explained.
 164+ * @param subQueryExpl explanation for the sub-query part.
 165+ * @param boostExpl explanation for the value source part.
 166+ * @return an explanation for the custom score
 167+ */
 168+ public Explanation customExplain(int doc, Explanation subQueryExpl, Explanation boostExpl) {
 169+ float boostScore = boostExpl==null ? 1 : boostExpl.getValue();
 170+ float sc = (0.2f + boostScore * 10);
 171+ Explanation exp = new Explanation( sc * subQueryExpl.getValue(), "custom score: product of:");
 172+ exp.addDetail(subQueryExpl);
 173+ if (boostExpl != null) {
 174+ exp.addDetail(boostExpl);
 175+ }
 176+ return exp;
 177+ }
 178+ //=========================== W E I G H T ============================
 179+
 180+ private class CustomWeight implements Weight {
 181+ Searcher searcher;
 182+ Weight subQueryWeight;
 183+ Weight boostWeight; // optional
 184+ boolean qStrict;
 185+
 186+ public CustomWeight(Searcher searcher) throws IOException {
 187+ this.searcher = searcher;
 188+ this.subQueryWeight = subQuery.weight(searcher);
 189+ if (boostQuery!=null) {
 190+ this.boostWeight = boostQuery.createWeight(searcher);
 191+ }
 192+ this.qStrict = strict;
 193+ }
 194+
 195+ /*(non-Javadoc) @see org.apache.lucene.search.Weight#getQuery() */
 196+ public Query getQuery() {
 197+ return CustomBoostQuery.this;
 198+ }
 199+
 200+ /*(non-Javadoc) @see org.apache.lucene.search.Weight#getValue() */
 201+ public float getValue() {
 202+ return getBoost();
 203+ }
 204+
 205+ /*(non-Javadoc) @see org.apache.lucene.search.Weight#sumOfSquaredWeights() */
 206+ public float sumOfSquaredWeights() throws IOException {
 207+ float sum = subQueryWeight.sumOfSquaredWeights();
 208+ if (boostWeight!=null) {
 209+ if (qStrict) {
 210+ boostWeight.sumOfSquaredWeights(); // do not include ValueSource part in the query normalization
 211+ } else {
 212+ sum += boostWeight.sumOfSquaredWeights();
 213+ }
 214+ }
 215+ sum *= getBoost() * getBoost(); // boost each sub-weight
 216+ return sum ;
 217+ }
 218+
 219+ /*(non-Javadoc) @see org.apache.lucene.search.Weight#normalize(float) */
 220+ public void normalize(float norm) {
 221+ norm *= getBoost(); // incorporate boost
 222+ subQueryWeight.normalize(norm);
 223+ if (boostWeight!=null) {
 224+ if (qStrict) {
 225+ boostWeight.normalize(1); // do not normalize the ValueSource part
 226+ } else {
 227+ boostWeight.normalize(norm);
 228+ }
 229+ }
 230+ }
 231+
 232+ /*(non-Javadoc) @see org.apache.lucene.search.Weight#scorer(org.apache.lucene.index.IndexReader) */
 233+ public Scorer scorer(IndexReader reader) throws IOException {
 234+ Scorer subQueryScorer = subQueryWeight.scorer(reader);
 235+ Scorer boostScorer = (boostWeight==null ? null : boostWeight.scorer(reader));
 236+ return new CustomScorer(getSimilarity(searcher), reader, this, subQueryScorer, boostScorer);
 237+ }
 238+
 239+ /*(non-Javadoc) @see org.apache.lucene.search.Weight#explain(org.apache.lucene.index.IndexReader, int) */
 240+ public Explanation explain(IndexReader reader, int doc) throws IOException {
 241+ return scorer(reader).explain(doc);
 242+ }
 243+ }
 244+
 245+
 246+ //=========================== S C O R E R ============================
 247+
 248+ /**
 249+ * A scorer that applies a (callback) function to the scores of the subQuery.
 250+ */
 251+ private class CustomScorer extends Scorer {
 252+ private final CustomWeight weight;
 253+ private final float qWeight;
 254+ private Scorer subQueryScorer;
 255+ private Scorer boostScorer; // optional
 256+ private IndexReader reader;
 257+
 258+ // constructor
 259+ private CustomScorer(Similarity similarity, IndexReader reader, CustomWeight w,
 260+ Scorer subQueryScorer, Scorer boostScorer) throws IOException {
 261+ super(similarity);
 262+ this.weight = w;
 263+ this.qWeight = w.getValue();
 264+ this.subQueryScorer = subQueryScorer;
 265+ this.boostScorer = boostScorer;
 266+ this.reader = reader;
 267+ }
 268+
 269+ /*(non-Javadoc) @see org.apache.lucene.search.Scorer#next() */
 270+ public boolean next() throws IOException {
 271+ boolean hasNext = subQueryScorer.next();
 272+ if (boostScorer!=null && hasNext) {
 273+ boostScorer.skipTo(subQueryScorer.doc());
 274+ }
 275+ return hasNext;
 276+ }
 277+
 278+ /*(non-Javadoc) @see org.apache.lucene.search.Scorer#doc() */
 279+ public int doc() {
 280+ return subQueryScorer.doc();
 281+ }
 282+
 283+ /*(non-Javadoc) @see org.apache.lucene.search.Scorer#score() */
 284+ public float score() throws IOException {
 285+ float boostScore = (boostScorer==null || subQueryScorer.doc() != boostScorer.doc() ? 0 : boostScorer.score());
 286+ return qWeight * customScore(subQueryScorer.doc(), subQueryScorer.score(), boostScore);
 287+ }
 288+
 289+ /*(non-Javadoc) @see org.apache.lucene.search.Scorer#skipTo(int) */
 290+ public boolean skipTo(int target) throws IOException {
 291+ boolean hasNext = subQueryScorer.skipTo(target);
 292+ if (boostScorer!=null && hasNext) {
 293+ boostScorer.skipTo(subQueryScorer.doc());
 294+ }
 295+ return hasNext;
 296+ }
 297+
 298+ /*(non-Javadoc) @see org.apache.lucene.search.Scorer#explain(int) */
 299+ public Explanation explain(int doc) throws IOException {
 300+ Explanation subQueryExpl = weight.subQueryWeight.explain(reader,doc);
 301+ if (!subQueryExpl.isMatch()) {
 302+ return subQueryExpl;
 303+ }
 304+ // match
 305+ Explanation boostExpl = boostScorer==null ? null :
 306+ weight.qStrict ? boostScorer.explain(doc) : weight.boostWeight.explain(reader,doc);
 307+ Explanation customExp = customExplain(doc,subQueryExpl,boostExpl);
 308+ float sc = qWeight * customExp.getValue();
 309+ Explanation res = new ComplexExplanation(
 310+ true, sc, CustomBoostQuery.this.toString() + ", product of:");
 311+ res.addDetail(customExp);
 312+ res.addDetail(new Explanation(qWeight, "queryBoost")); // actually using the q boost as q weight (== weight value)
 313+ return res;
 314+ }
 315+ }
 316+
 317+ /*(non-Javadoc) @see org.apache.lucene.search.Query#createWeight(org.apache.lucene.search.Searcher) */
 318+ protected Weight createWeight(Searcher searcher) throws IOException {
 319+ return new CustomWeight(searcher);
 320+ }
 321+
 322+ /**
 323+ * Checks if this is strict custom scoring.
 324+ * In strict custom scoring, the ValueSource part does not participate in weight normalization.
 325+ * This may be useful when one wants full control over how scores are modified, and does
 326+ * not care about normalizing by the ValueSource part.
 327+ * One particular case where this is useful is when testing this query.
 328+ * <P>
 329+ * Note: only has effect when the ValueSource part is not null.
 330+ */
 331+ public boolean isStrict() {
 332+ return strict;
 333+ }
 334+
 335+ /**
 336+ * Set the strict mode of this query.
 337+ * @param strict The strict mode to set.
 338+ * @see #isStrict()
 339+ */
 340+ public void setStrict(boolean strict) {
 341+ this.strict = strict;
 342+ }
 343+
 344+ /**
 345+ * A short name of this query, used in {@link #toString(String)}.
 346+ */
 347+ public String name() {
 348+ return "custom";
 349+ }
 350+
 351+}
 352+
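For illustration, a minimal sketch of how the new query is meant to be combined with the rank-based value source added later in this commit (the field name and term are hypothetical):

    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.CustomBoostQuery;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.TermQuery;
    import org.wikimedia.lsearch.search.RankValueSource;
    import org.wikimedia.lsearch.search.RankValueSourceQuery;

    public class CustomBoostExample {
        public static void main(String[] args) {
            // relevance part: an ordinary term query
            Query sub = new TermQuery(new Term("contents", "wikipedia"));
            // boosting part: per-document rank read via the value source
            Query boost = new RankValueSourceQuery(new RankValueSource());
            CustomBoostQuery q = new CustomBoostQuery(sub, boost);
            q.setStrict(true); // keep the rank part out of weight normalization
            // default combination: (0.2 + boostScore * 10) * subQueryScore,
            // e.g. boostScore 0.5 and subQueryScore 2.0 give (0.2 + 5) * 2.0 = 10.4
            System.out.println(q.toString("contents"));
        }
    }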
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/ResultSet.java
@@ -1,6 +1,8 @@
22 package org.wikimedia.lsearch.beans;
33
44 import java.io.Serializable;
 5+import java.util.ArrayList;
 6+import java.util.Collection;
57
68 import org.apache.lucene.search.Explanation;
79
@@ -9,6 +11,7 @@
1012 public double score;
1113 public String namespace;
1214 public String title;
 15+ public ArrayList<String> context;
1316 Explanation explanation;
1417
1518 public ResultSet(String key) {
@@ -43,7 +46,25 @@
4447 @Override
4548 public String toString() {
4649 return score+" "+namespace+":"+title+(explanation==null? "" : "\n"+explanation);
47 - }
 50+ }
4851
 52+ public void addContext(Collection<String> texts){
 53+ if(texts == null)
 54+ return;
 55+ for(String t : texts)
 56+ addContext(t);
 57+ }
4958
 59+ public void addContext(String text){
 60+ if(context == null)
 61+ context = new ArrayList<String>();
 62+
 63+ context.add(text.replace('\n',' '));
 64+ }
 65+
 66+ public ArrayList<String> getContext(){
 67+ return context;
 68+ }
 69+
 70+
5071 }
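A short sketch of the intended use of the new context accessors (the key and snippets are illustrative):

    ResultSet rs = new ResultSet("0:Computer");
    rs.addContext("A computer is a machine\nfor computing"); // newline is folded to a space
    rs.addContext(java.util.Arrays.asList("snippet one", "snippet two")); // null-safe bulk add
    for (String c : rs.getContext())
        System.out.println(c);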
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/Title.java
@@ -17,11 +17,11 @@
1818 }
1919
2020 public Title(String key){
21 - String[] parts = key.split(":",2);
22 - if(parts.length != 2)
 21+ int col = key.indexOf(':');
 22+ if(col == -1)
2323 throw new RuntimeException("Wrong key format in Title constructor");
24 - this.namespace = Integer.parseInt(parts[0]);
25 - this.title = parts[1];
 24+ this.namespace = Integer.parseInt(key.substring(0,col));
 25+ this.title = key.substring(col+1);
2626 }
2727
2828 public String getKey(){
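The rewritten constructor is a micro-optimization on a hot path: indexOf/substring avoids the regex machinery and temporary array behind String.split, with identical behavior. For example:

    Title t = new Title("2:Rainman"); // namespace 2, title "Rainman"
    Title bad = new Title("Rainman"); // no colon: still throws RuntimeException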
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/prefix/PrefixIndexBuilder.java
@@ -119,7 +119,7 @@
120120 selected.add(sorted.get(i).getKey());
121121 }
122122 Document d = new Document();
123 - d.add(new Field("prefix",prefix,Field.Store.NO,Field.Index.UN_TOKENIZED));
 123+ d.add(new Field("prefix",prefix,Field.Store.NO,Field.Index.NO_NORMS));
124124 d.add(new Field("articles",new StringList(selected).toString(),Field.Store.YES,Field.Index.NO));
125125 writer.addDocument(d);
126126 }
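In Lucene 2.x, Field.Index.NO_NORMS indexes the value as a single token just like UN_TOKENIZED, but omits the norm, so the prefix field skips length normalization and saves a byte per document. The change in isolation:

    // before: untokenized, norms stored
    new Field("prefix", prefix, Field.Store.NO, Field.Index.UN_TOKENIZED);
    // after: untokenized, no norms
    new Field("prefix", prefix, Field.Store.NO, Field.Index.NO_NORMS);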
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/DumpImporter.java
@@ -38,17 +38,17 @@
3939 Revision revision;
4040 SimpleIndexWriter writer;
4141 int count = 0, limit;
42 - LinkAnalysisStorage las;
 42+ Links links;
4343 String langCode;
4444 RelatedStorage related;
4545
4646 public DumpImporter(String dbname, int limit, Boolean optimize, Integer mergeFactor,
47 - Integer maxBufDocs, boolean newIndex, LinkAnalysisStorage las, String langCode){
 47+ Integer maxBufDocs, boolean newIndex, Links links, String langCode){
4848 Configuration.open(); // make sure configuration is loaded
4949 IndexId iid = IndexId.get(dbname);
50 - writer = new SimpleIndexWriter(iid, optimize, mergeFactor, maxBufDocs, newIndex);
 50+ writer = new SimpleIndexWriter(links, iid, optimize, mergeFactor, maxBufDocs, newIndex);
5151 this.limit = limit;
52 - this.las = las;
 52+ this.links = links;
5353 this.langCode = langCode;
5454 this.related = new RelatedStorage(iid);
5555 if(!related.canRead())
@@ -62,26 +62,29 @@
6363 }
6464 public void writeEndPage() throws IOException {
6565 String key = page.Title.Namespace+":"+page.Title.Text;
66 - ArticleAnalytics aa = las.getAnaliticsForArticle(key);
67 - int references = aa.getReferences();
68 - boolean isRedirect = aa.isRedirect();
69 - int redirectTargetNamespace = aa.getRedirectTargetNamespace();
 66+ int references = links.getNumInLinks(key);
 67+ boolean isRedirect = links.isRedirect(key);
 68+ int redirectTargetNamespace = isRedirect? links.getRedirectTargetNamespace(key) : -1;
7069
7170 // make list of redirects
7271 ArrayList<Redirect> redirects = new ArrayList<Redirect>();
7372 ArrayList<String> anchors = new ArrayList<String>();
74 - anchors.addAll(aa.getAnchorText());
75 - for(String rk : aa.getRedirectKeys()){
 73+ //anchors.addAll(aa.getAnchorText());
 74+ for(String rk : links.getRedirectsTo(key)){
7675 String[] parts = rk.toString().split(":",2);
77 - ArticleAnalytics raa = las.getAnaliticsForReferences(rk);
78 - redirects.add(new Redirect(Integer.parseInt(parts[0]),parts[1],raa.getReferences()));
79 - anchors.addAll(raa.getAnchorText());
 76+ int redirectRef = links.getNumInLinks(rk);
 77+ redirects.add(new Redirect(Integer.parseInt(parts[0]),parts[1],redirectRef));
 78+ //anchors.addAll(raa.getAnchorText());
8079 }
8180 ArrayList<RelatedTitle> rel = null;
8281 if(related != null)
8382 rel = related.getRelated(key);
8483 else
8584 rel = new ArrayList<RelatedTitle>();
 85+ // extract contexts
 86+ /*for(RelatedTitle t : rel){
 87+ links.getContext(t.getRelated().getKey(),key);
 88+ } */
8689 // make article
8790 Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect,
8891 references,redirectTargetNamespace,redirects,rel,anchors);
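The importer now pulls per-article metadata straight from the links index; a minimal fragment of the calls it relies on (the article key is hypothetical; keys use the namespace:title format):

    IndexId ll = iid.getLinks();
    Links links = Links.openForRead(ll, ll.getImportPath());
    String key = "0:Computer";
    int references = links.getNumInLinks(key); // in-link count used for ranking
    int redirNs = links.isRedirect(key) ? links.getRedirectTargetNamespace(key) : -1;
    for (String rk : links.getRedirectsTo(key))
        System.out.println("redirect from: " + rk);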
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java
@@ -17,6 +17,7 @@
1818 import org.wikimedia.lsearch.index.IndexUpdateRecord;
1919 import org.wikimedia.lsearch.index.WikiIndexModifier;
2020 import org.wikimedia.lsearch.index.WikiSimilarity;
 21+import org.wikimedia.lsearch.ranks.Links;
2122
2223 /**
2324 * IndexWriter for building indexes from scratch.
@@ -33,8 +34,10 @@
3435 protected Integer mergeFactor, maxBufDocs;
3536 protected boolean newIndex;
3637 protected String langCode;
 38+ protected Links links;
3739
38 - public SimpleIndexWriter(IndexId iid, Boolean optimize, Integer mergeFactor, Integer maxBufDocs, boolean newIndex){
 40+ public SimpleIndexWriter(Links links, IndexId iid, Boolean optimize, Integer mergeFactor, Integer maxBufDocs, boolean newIndex){
 41+ this.links = links;
3942 this.iid = iid;
4043 this.optimize = optimize;
4144 this.mergeFactor = mergeFactor;
@@ -43,7 +46,7 @@
4447 GlobalConfiguration global = GlobalConfiguration.getInstance();
4548 langCode = global.getLanguage(iid.getDBname());
4649 FieldBuilder.Case dCase = (global.exactCaseIndex(iid.getDBname()))? FieldBuilder.Case.EXACT_CASE : FieldBuilder.Case.IGNORE_CASE;
47 - builder = new FieldBuilder(langCode,dCase);
 50+ builder = new FieldBuilder(iid,dCase);
4851 indexes = new HashMap<String,IndexWriter>();
4952 // open all relevant indexes
5053 if(iid.isSingle())
@@ -109,7 +112,7 @@
110113 IndexWriter writer = indexes.get(target.toString());
111114 if(writer == null)
112115 return;
113 - Object[] ret = WikiIndexModifier.makeDocumentAndAnalyzer(a,builder,iid);
 116+ Object[] ret = WikiIndexModifier.makeDocumentAndAnalyzer(a,builder,iid,links);
114117 Document doc = (Document) ret[0];
115118 Analyzer analyzer = (Analyzer) ret[1];
116119 try {
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/Importer.java
@@ -20,6 +20,7 @@
2121 import org.wikimedia.lsearch.ranks.Links;
2222 import org.wikimedia.lsearch.ranks.RankBuilder;
2323 import org.wikimedia.lsearch.related.CompactLinks;
 24+import org.wikimedia.lsearch.related.RelatedBuilder;
2425 import org.wikimedia.lsearch.storage.LinkAnalysisStorage;
2526 import org.wikimedia.lsearch.storage.Storage;
2627 import org.wikimedia.lsearch.util.Localization;
@@ -45,6 +46,7 @@
4647 Integer mergeFactor = null, maxBufDocs = null;
4748 boolean newIndex = true, makeSnapshot = false;
4849 boolean snapshotDb = false, useOldLinkAnalysis = false;
 50+ boolean useOldRelated = false;
4951
5052 System.out.println("MediaWiki Lucene search indexer - index builder from xml database dumps.\n");
5153
@@ -52,12 +54,13 @@
5355 log = Logger.getLogger(Importer.class);
5456
5557 if(args.length < 2){
56 - System.out.println("Syntax: java Importer [-a] [-n] [-s] [-la] [-l limit] [-o optimize] [-m mergeFactor] [-b maxBufDocs] <inputfile> <dbname>");
 58+ System.out.println("Syntax: java Importer [-a] [-n] [-s] [-l] [-r] [-lm limit] [-o optimize] [-m mergeFactor] [-b maxBufDocs] <inputfile> <dbname>");
5759 System.out.println("Options: ");
5860 System.out.println(" -a - don't create new index, append to old");
5961 System.out.println(" -s - make index snapshot when finished");
60 - System.out.println(" -la - use earlier link analysis index, don't recalculate");
61 - System.out.println(" -l limit_num - add at most limit_num articles");
 62+ System.out.println(" -l - use earlier link analysis index, don't recalculate");
 63+ System.out.println(" -r - use earlier related index, don't recalculate");
 64+ System.out.println(" -lm limit_num - add at most limit_num articles");
6265 System.out.println(" -o optimize - true/false overrides optimization param from global settings");
6366 System.out.println(" -m mergeFactor - overrides param from global settings");
6467 System.out.println(" -b maxBufDocs - overrides param from global settings");
@@ -65,7 +68,7 @@
6669 return;
6770 }
6871 for(int i=0;i<args.length;i++){
69 - if(args[i].equals("-l"))
 72+ if(args[i].equals("-lm"))
7073 limit = Integer.parseInt(args[++i]);
7174 else if(args[i].equals("-o"))
7275 optimize = Boolean.parseBoolean(args[++i]);
@@ -75,8 +78,10 @@
7679 maxBufDocs = Integer.parseInt(args[++i]);
7780 else if(args[i].equals("-a"))
7881 newIndex = false;
79 - else if(args[i].equals("-la"))
 82+ else if(args[i].equals("-l"))
8083 useOldLinkAnalysis = true;
 84+ else if(args[i].equals("-r"))
 85+ useOldRelated = true;
8186 else if(args[i].equals("-s"))
8287 makeSnapshot = true;
8388 else if(args[i].equals("--snapshot")){
@@ -106,17 +111,23 @@
107112 long start = System.currentTimeMillis();
108113
109114 if(!useOldLinkAnalysis){
110 - // regenerate link and redirect information
111 - Links links = RankBuilder.processLinks(inputfile,RankBuilder.getTitles(inputfile,langCode,iid),langCode);
 115+ // regenerate link and redirect information
112116 try {
113 - RankBuilder.storeLinkAnalysis(links,iid);
 117+ RankBuilder.processLinks(inputfile,Links.createNew(iid),iid,langCode);
114118 } catch (IOException e) {
115119 log.fatal("Cannot store link analytics: "+e.getMessage());
116120 return;
117121 }
118122 }
119 - log.info("Third pass, indexing articles...");
120 -
 123+ if(!useOldRelated){
 124+ try {
 125+ RelatedBuilder.rebuildFromLinks(iid);
 126+ } catch (IOException e) {
 127+ log.fatal("Cannot make related mapping: "+e.getMessage());
 128+ return;
 129+ }
 130+ }
 131+
121132 // open
122133 InputStream input = null;
123134 try {
@@ -124,31 +135,29 @@
125136 } catch (IOException e) {
126137 log.fatal("I/O error opening "+inputfile);
127138 return;
128 - }
129 - LinkAnalysisStorage las = new LinkAnalysisStorage(iid);
130 - // read
131 - DumpImporter dp = new DumpImporter(dbname,limit,optimize,mergeFactor,maxBufDocs,newIndex,las,langCode);
132 - XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(dp, 1000));
 139+ }
 140+ long end = start;
133141 try {
 142+ log.info("Indexing articles...");
 143+ IndexId ll = iid.getLinks();
 144+ Links links = Links.openForRead(ll,ll.getImportPath());
 145+ // read
 146+ DumpImporter dp = new DumpImporter(dbname,limit,optimize,mergeFactor,maxBufDocs,newIndex,links,langCode);
 147+ XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(dp, 1000));
134148 reader.readDump();
 149+ log.info("Closing/optimizing index...");
 150+ dp.closeIndex();
 151+ end = System.currentTimeMillis();
 152+ System.out.println("Cache stats: "+links.getCache().getStats());
135153 } catch (IOException e) {
136154 if(!e.getMessage().equals("stopped")){
137 - log.fatal("I/O error reading dump for "+dbname+" from "+inputfile);
 155+ log.fatal("I/O error processing dump for "+dbname+" from "+inputfile+" : "+e.getMessage());
 156+ e.printStackTrace();
138157 return;
139158 }
140 - }
141 -
142 - long end = System.currentTimeMillis();
143 -
144 - log.info("Closing/optimizing index...");
145 - try{
146 - dp.closeIndex();
147 - } catch(IOException e){
148 - e.printStackTrace();
149 - log.fatal("Cannot close/optimize index : "+e.getMessage());
150159 System.exit(1);
151160 }
152 -
 161+
153162 long finalEnd = System.currentTimeMillis();
154163
155164 System.out.println("Finished indexing in "+formatTime(end-start)+", with final index optimization in "+formatTime(finalEnd-end));
@@ -168,6 +177,16 @@
169178 } else
170179 IndexThread.makeIndexSnapshot(iid,iid.getImportPath());
171180 }
 181+
 182+ // some cache stats
 183+ /*Cache cache = CacheManager.create().getCache("links");
 184+ Statistics s = cache.getStatistics();
 185+
 186+ long hit = s.getCacheHits();
 187+ long miss = s.getCacheMisses();
 188+
 189+ System.out.println("Cache stats: hits = "+hit+", miss = "+miss); */
 190+
172191 }
173192
174193 private static String formatTime(long l) {
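With the renamed switches, a full rebuild that snapshots the result might be invoked as (paths and dbname illustrative):

    java org.wikimedia.lsearch.importer.Importer -s -lm 10000 enwiki-dump.xml enwiki

Adding -l and/or -r reuses the previously built links and related indexes instead of recalculating them.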
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/RelatedStorage.java
@@ -35,7 +35,7 @@
3636 StringList sl = new StringList(CompactRelated.convertToStringList(rel));
3737 Document doc = new Document();
3838 doc.add(new Field("key",key,Field.Store.YES,Field.Index.UN_TOKENIZED));
39 - doc.add(new Field("related",sl.toString(),Field.Store.YES,Field.Index.NO));
 39+ doc.add(new Field("related",sl.toString(),Field.Store.COMPRESS,Field.Index.NO));
4040 writer.addDocument(doc);
4141 }
4242
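Field.Store.COMPRESS (Lucene 2.x) stores the value zlib-compressed, which pays off here since the serialized related list can grow long; reads are unchanged because decompression is transparent:

    String related = reader.document(docId).get("related"); // decompressed automatically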
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/LinkAnalysisStorage.java
@@ -32,17 +32,20 @@
3333 * @author rainman
3434 *
3535 */
 36+@Deprecated
3637 public class LinkAnalysisStorage extends LuceneStorage {
3738 static Logger log = Logger.getLogger(LinkAnalysisStorage.class);
3839 protected SetBasedFieldSelector selRef;
3940
4041 public LinkAnalysisStorage(IndexId iid){
41 - super(iid.getLinkAnalysis());
 42+ //super(iid.getLinkAnalysis());
 43+ super(iid);
4244 init();
4345 }
4446
4547 public LinkAnalysisStorage(IndexId iid, String path){
46 - super(iid.getLinkAnalysis(),path);
 48+ //super(iid.getLinkAnalysis(),path);
 49+ super(iid,path);
4750 init();
4851 }
4952
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/IndexId.java
@@ -58,7 +58,7 @@
5959 /** If true, this machine is an indexer for this index */
6060 protected boolean myIndex;
6161
62 - protected enum IndexType { SINGLE, MAINSPLIT, SPLIT, NSSPLIT, SPELL, LINK_ANALYSIS, RELATED, PREFIX };
 62+ protected enum IndexType { SINGLE, MAINSPLIT, SPLIT, NSSPLIT, SPELL, LINKS, RELATED, PREFIX, PREFIX_TITLES };
6363
6464 /** Type of index, enumeration */
6565 protected IndexType type;
@@ -95,6 +95,9 @@
9696 protected String OAIRepository;
9797
9898 protected String rsyncSnapshotPath = null;
 99+
 100+ /** language code, e.g. "en" */
 101+ protected String langCode = null;
99102
100103 /**
101104 * Get index Id object given it's string representation, the actual object
@@ -105,7 +108,10 @@
106109 * @return
107110 */
108111 static public IndexId get(String dbrole){
109 - return GlobalConfiguration.getIndexId(dbrole);
 112+ IndexId ret = GlobalConfiguration.getIndexId(dbrole);
 113+ if(ret == null)
 114+ throw new RuntimeException("Index "+dbrole+" doesn't exist");
 115+ return ret;
110116 }
111117
112118 /**
@@ -158,12 +164,14 @@
159165 this.type = IndexType.NSSPLIT;
160166 else if(type.equals("spell"))
161167 this.type = IndexType.SPELL;
162 - else if(type.equals("link_analysis"))
163 - this.type = IndexType.LINK_ANALYSIS;
 168+ else if(type.equals("links"))
 169+ this.type = IndexType.LINKS;
164170 else if(type.equals("related"))
165171 this.type = IndexType.RELATED;
166172 else if(type.equals("prefix"))
167173 this.type = IndexType.PREFIX;
 174+ else if(type.equals("prefix_titles"))
 175+ this.type = IndexType.PREFIX_TITLES;
168176
169177 // parts
170178 String[] parts = dbrole.split("\\.");
@@ -259,9 +267,9 @@
260268 public boolean isSpell(){
261269 return type == IndexType.SPELL;
262270 }
263 - /** If this is the link-analysis index */
264 - public boolean isLinkAnalysis(){
265 - return type == IndexType.LINK_ANALYSIS;
 271+ /** If this is the index storing pagelinks */
 272+ public boolean isLinks(){
 273+ return type == IndexType.LINKS;
266274 }
267275 /** If this is the index storing info about related articles */
268276 public boolean isRelated(){
@@ -271,6 +279,10 @@
272280 public boolean isPrefix(){
273281 return type == IndexType.PREFIX;
274282 }
 283+ /** If this is the index storing titles for the prefix index */
 284+ public boolean isPrefixTitles(){
 285+ return type == IndexType.PREFIX_TITLES;
 286+ }
275287
276288 /** If this is a split index, returns the current part number, e.g. for entest.part4 will return 4 */
277289 public int getPartNum() {
@@ -418,7 +430,7 @@
419431
420432 /** get all hosts that search db this iid belongs to */
421433 public HashSet<String> getDBSearchHosts(){
422 - if(isSingle() || isSpell() || isLinkAnalysis() || isRelated() || isPrefix())
 434+ if(isSingle() || isSpell() || isLinks() || isRelated() || isPrefix() || isPrefixTitles())
423435 return searchHosts;
424436 else{
425437 // add all hosts that search: dbname and all parts
@@ -469,7 +481,7 @@
470482 */
471483 public HashSet<String> getPhysicalIndexes() {
472484 HashSet<String> ret = new HashSet<String>();
473 - if(isSingle() || isSpell() || isLinkAnalysis() || isRelated() || isPrefix())
 485+ if(isSingle() || isSpell() || isLinks() || isRelated() || isPrefix() || isPrefixTitles())
474486 ret.add(dbrole);
475487 else if(isMainsplit() || isSplit() || isNssplit()){
476488 for(String p : splitParts)
@@ -545,9 +557,9 @@
546558 return get(dbname+".spell");
547559 }
548560
549 - /** Get the link analysis iid */
550 - public IndexId getLinkAnalysis() {
551 - return get(dbname+".link_analysis");
 561+ /** Get the pagelinks iid */
 562+ public IndexId getLinks() {
 563+ return get(dbname+".links");
552564 }
553565
554566 /** Get the related-articles index iid */
@@ -560,6 +572,17 @@
561573 return get(dbname+".prefix");
562574 }
563575
 576+ /** Get the prefix titles index iid */
 577+ public IndexId getPrefixTitles() {
 578+ return get(dbname+".prefix_titles");
 579+ }
564580
 581+ /** Get language code for this db, e.g. "en" */
 582+ public String getLangCode(){
 583+ if(langCode == null)
 584+ langCode = GlobalConfiguration.getInstance().getLanguage(dbname);
 585+ return langCode;
 586+ }
 587+
565588
566589 }
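The new accessors keep the dbrole strings in one place; illustrative use (dbname hypothetical):

    IndexId iid = IndexId.get("enwiki");    // now throws RuntimeException if unknown
    IndexId links = iid.getLinks();         // enwiki.links
    IndexId titles = iid.getPrefixTitles(); // enwiki.prefix_titles
    String lang = iid.getLangCode();        // e.g. "en", cached after the first lookup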
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/GlobalConfiguration.java
@@ -186,10 +186,12 @@
187187 }
188188 }
189189 // add the link analysis to indexers
190 - if(!types.contains("link_analysis"))
191 - database.get(dbname).put("link_analysis",new Hashtable<String,String>());
 190+ if(!types.contains("links"))
 191+ database.get(dbname).put("links",new Hashtable<String,String>());
192192 if(!types.contains("related"))
193193 database.get(dbname).put("related",new Hashtable<String,String>());
 194+ if(!types.contains("prefix_titles"))
 195+ database.get(dbname).put("prefix_titles",new Hashtable<String,String>());
194196 }
195197 // expand logical index names on searchers
196198 for(String host : search.keySet()){
@@ -232,7 +234,7 @@
233235 } else if(typeid.matches("nspart[1-9][0-9]*")){
234236 type = "nssplit";
235237 dbrole = dbname + "." + typeid;
236 - } else if(typeid.equals("spell") || typeid.equals("link_analysis") || typeid.equals("related") || typeid.equals("prefix")){
 238+ } else if(typeid.equals("spell") || typeid.equals("links") || typeid.equals("related") || typeid.equals("prefix") || typeid.equals("prefix_titles")){
237239 type = typeid;
238240 dbrole = dbname + "." + typeid;
239241 } else
@@ -252,7 +254,7 @@
253255 }
254256 boolean searched = (getSearchHosts(dbrole).size() != 0);
255257 if(!searched && !(typeid.equals("mainsplit") || typeid.equals("split")
256 - || typeid.equals("nssplit") || typeid.equals("link_analysis") || typeid.equals("related"))){
 258+ || typeid.equals("nssplit") || typeid.equals("links") || typeid.equals("related") || typeid.equals("prefix_titles"))){
257259 if(verbose)
258260 System.out.println("WARNING: in Global Configuration: index "+dbrole+" is not searched by any host.");
259261 }
@@ -519,7 +521,7 @@
520522 } else if(typeid.matches("nspart[1-9][0-9]*")){
521523 type = "nssplit";
522524 dbrole = dbname + "." + typeid;
523 - } else if(typeid.equals("spell") || typeid.equals("link_analysis") || typeid.equals("related") || typeid.equals("prefix")){
 525+ } else if(typeid.equals("spell") || typeid.equals("links") || typeid.equals("related") || typeid.equals("prefix") || typeid.equals("prefix_titles")){
524526 type = typeid;
525527 dbrole = dbname + "." + typeid;
526528 } else
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java
@@ -5,6 +5,7 @@
66 import java.net.URI;
77 import java.text.MessageFormat;
88 import java.util.ArrayList;
 9+import java.util.Collection;
910 import java.util.HashMap;
1011 import java.util.HashSet;
1112 import java.util.Hashtable;
@@ -31,14 +32,19 @@
3233 import org.wikimedia.lsearch.analyzers.WikiQueryParser;
3334 import org.wikimedia.lsearch.beans.ResultSet;
3435 import org.wikimedia.lsearch.beans.SearchResults;
 36+import org.wikimedia.lsearch.beans.Title;
3537 import org.wikimedia.lsearch.config.GlobalConfiguration;
3638 import org.wikimedia.lsearch.config.IndexId;
3739 import org.wikimedia.lsearch.frontend.SearchDaemon;
3840 import org.wikimedia.lsearch.frontend.SearchServer;
3941 import org.wikimedia.lsearch.interoperability.RMIMessengerClient;
 42+import org.wikimedia.lsearch.ranks.Links;
4043 import org.wikimedia.lsearch.ranks.StringList;
 44+import org.wikimedia.lsearch.related.Related;
 45+import org.wikimedia.lsearch.related.RelatedTitle;
4146 import org.wikimedia.lsearch.spell.Suggest;
4247 import org.wikimedia.lsearch.spell.SuggestQuery;
 48+import org.wikimedia.lsearch.util.Localization;
4349 import org.wikimedia.lsearch.util.QueryStringMap;
4450
4551 /**
@@ -54,6 +60,7 @@
5561 protected final int maxlines = 1000;
5662 protected final int maxoffset = 10000;
5763 protected static GlobalConfiguration global = null;
 64+ protected static Hashtable<String,Hashtable<String,Integer>> dbNamespaces = new Hashtable<String,Hashtable<String,Integer>>();
5865
5966 public SearchEngine(){
6067 if(global == null)
@@ -102,17 +109,87 @@
103110 // TODO: return searchTitles(searchterm);
104111 } else if (what.equals("prefix")){
105112 return prefixSearch(iid, searchterm);
 113+ } else if (what.equals("related")){
 114+ int offset = 0, limit = 100; boolean exactCase = false;
 115+ if (query.containsKey("offset"))
 116+ offset = Math.max(Integer.parseInt((String)query.get("offset")), 0);
 117+ if (query.containsKey("limit"))
 118+ limit = Math.min(Integer.parseInt((String)query.get("limit")), maxlines);
 119+ return relatedSearch(iid, searchterm, offset, limit);
106120 } else {
107121 SearchResults res = new SearchResults();
108122 res.setErrorMsg("Unrecognized search type. Try one of: " +
109 - "search, explain, raw, rawexplain, prefix.");
 123+ "search, explain, raw, rawexplain, prefix, related.");
110124 log.warn("Unknown request type [" + what + "].");
111125 return res;
112126 }
113127 return null;
114128 }
115129
116 - private SearchResults prefixSearch(IndexId iid, String searchterm) {
 130+ /** Convert User:Rainman into 2:Rainman */
 131+ protected String getKey(String title, IndexId iid){
 132+ int colon = title.indexOf(':');
 133+ if(colon != -1 && colon != title.length()-1){
 134+ String ns = title.substring(0,colon);
 135+ Integer inx = dbNamespaces.get(iid.getDBname()).get(ns.toLowerCase());
 136+ if(inx != null){
 137+ return inx +":"+ title.substring(colon+1);
 138+ }
 139+ }
 140+
 141+ return "0:" + title;
 142+ }
 143+
 144+ protected SearchResults relatedSearch(IndexId iid, String searchterm, int offset, int limit) {
 145+ readLocalization(iid);
 146+ IndexId rel = iid.getRelated();
 147+ IndexId lin = iid.getLinks();
 148+ SearcherCache cache = SearcherCache.getInstance();
 149+ SearchResults res = new SearchResults();
 150+ try {
 151+ IndexSearcherMul searcher = cache.getLocalSearcher(rel);
 152+ IndexReader reader = searcher.getIndexReader();
 153+ String key = getKey(searchterm,iid);
 154+ TermDocs td = reader.termDocs(new Term("key",key));
 155+ if(td.next()){
 156+ ArrayList<RelatedTitle> col = Related.convertToRelatedTitleList(new StringList(reader.document(td.doc()).get("related")).toCollection());
 157+ res.setNumHits(col.size());
 158+ res.setSuccess(true);
 159+ // TODO: this is extremely slow
 160+ Links links = Links.openForRead(lin,lin.getSearchPath());
 161+ for(int i=offset;i<offset+limit && i<col.size();i++){
 162+ RelatedTitle rt = col.get(i);
 163+ Title t = rt.getRelated();
 164+ ResultSet rs = new ResultSet(rt.getScore(),t.getNamespaceAsString(),t.getTitle());
 165+ rs.addContext(links.getContext(t.getKey(),key));
 166+ res.addResult(rs);
 167+ }
 168+ } else{
 169+ res.setSuccess(true);
 170+ res.setNumHits(0);
 171+ }
 172+ } catch (IOException e) {
 173+ e.printStackTrace();
 174+ log.error("I/O error in relatedSearch on "+rel+" : "+e.getMessage());
 175+ res.setErrorMsg("I/O Error processing index for "+rel);
 176+ }
 177+ return res;
 178+ }
 179+
 180+ protected void readLocalization(IndexId iid){
 181+ if(!dbNamespaces.containsKey(iid.getDBname())){
 182+ synchronized(dbNamespaces){
 183+ HashMap<String,Integer> m = Localization.getLocalizedNamespaces(iid.getLangCode(),iid.getDBname());
 184+ Hashtable<String,Integer> map = new Hashtable<String,Integer>();
 185+ if(m != null)
 186+ map.putAll(m);
 187+ dbNamespaces.put(iid.getDBname(),map);
 188+ }
 189+ }
 190+ }
 191+
 192+ protected SearchResults prefixSearch(IndexId iid, String searchterm) {
 193+ readLocalization(iid);
117194 IndexId pre = iid.getPrefix();
118195 SearcherCache cache = SearcherCache.getInstance();
119196 SearchResults res = new SearchResults();
@@ -144,7 +221,8 @@
145222 }
146223 } catch (IOException e) {
147224 // res.setErrorMsg("Internal error during prefix search: "+e.getMessage());
148 - log.error("Internal error in SearchEngine::prefixSearch : "+e.getMessage());
 225+ log.error("Internal error in prefixSearch on "+pre+" : "+e.getMessage());
 226+ res.setErrorMsg("I/O error on index "+pre);
149227 }
150228 return res;
151229 }
@@ -166,9 +244,10 @@
167245 localfilter = null;
168246 if(localfilter != null)
169247 log.info("Using local filter: "+localfilter);
170 - Hits hits = searcher.search(q,localfilter);
 248+ TopDocs hits = searcher.search(q,localfilter,offset+limit);
171249 return makeSearchResults(searcher,hits,offset,limit,iid,searchterm,q,searchStart,explain);
172250 } catch (IOException e) {
 251+ e.printStackTrace();
173252 SearchResults res = new SearchResults();
174253 res.setErrorMsg("Internal error in SearchEngine: "+e.getMessage());
175254 log.error("Internal error in SearchEngine while trying to search main part: "+e.getMessage());
@@ -186,7 +265,7 @@
187266 if(nsDefault == null || nsDefault.cardinality() == 0)
188267 nsDefault = new NamespaceFilter("0"); // default to main namespace
189268 FieldBuilder.Case dCase = exactCase? FieldBuilder.Case.EXACT_CASE : FieldBuilder.Case.IGNORE_CASE;
190 - FieldBuilder.BuilderSet bs = new FieldBuilder(global.getLanguage(iid.getDBname()),dCase).getBuilder(dCase);
 269+ FieldBuilder.BuilderSet bs = new FieldBuilder(iid,dCase).getBuilder(dCase);
191270 ArrayList<String> stopWords = null;
192271 try{
193272 stopWords = StopWords.getCached(iid);
@@ -354,7 +433,8 @@
355434
356435 /** Our scores can span several orders of magnitude, transform them to be more relevant to the user */
357436 public float transformScore(double score){
358 - return (float) (Math.log10(1+score*99)/2);
 437+ //return (float) (Math.log10(1+score*99)/2);
 438+ return (float) score;
359439 }
360440
361441 protected SearchResults makeSearchResults(SearchableMul s, TopDocs hits, int offset, int limit, IndexId iid, String searchterm, Query q, long searchStart, boolean explain) throws IOException{
@@ -375,14 +455,15 @@
376456 // fetch documents
377457 Document[] docs = s.docs(docids);
378458 int j=0;
379 - float maxScore = hits.getMaxScore();
 459+ //float maxScore = hits.getMaxScore();
 460+ float maxScore = 1;
380461 for(Document doc : docs){
381462 String namespace = doc.get("namespace");
382463 String title = doc.get("title");
383464 float score = transformScore(scores[j]/maxScore);
384465 ResultSet rs = new ResultSet(score,namespace,title);
385466 if(explain)
386 - rs.setExplanation(((WikiSearcher)s).explain(q,docids[j]));
 467+ rs.setExplanation(((Searcher)s).explain(q,docids[j]));
387468 res.addResult(rs);
388469 j++;
389470 }
@@ -410,8 +491,8 @@
411492 Document[] docs = s.docs(docids);
412493 int j=0;
413494 float maxScore = 1;
414 - if(numhits>0)
415 - maxScore = hits.score(0);
 495+ //if(numhits>0)
 496+ // maxScore = hits.score(0);
416497 for(Document doc : docs){
417498 String namespace = doc.get("namespace");
418499 String title = doc.get("title");
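Assuming the daemon's usual /what/dbname/searchterm URL layout, the new search type might be exercised as (host, port and dbname illustrative):

    http://localhost:8123/related/enwiki/Computer?offset=0&limit=20

The term is converted to a namespace:title key via getKey() (e.g. User:Rainman becomes 2:Rainman), looked up in the related index, and each hit is decorated with link context, which is why the TODO above flags the per-result Links lookups as slow.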
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/RankValueSourceQuery.java
@@ -0,0 +1,178 @@
 2+package org.wikimedia.lsearch.search;
 3+
 4+import java.io.IOException;
 5+import java.util.Set;
 6+
 7+import org.apache.lucene.index.IndexReader;
 8+import org.apache.lucene.search.ComplexExplanation;
 9+import org.apache.lucene.search.Explanation;
 10+import org.apache.lucene.search.Query;
 11+import org.apache.lucene.search.Scorer;
 12+import org.apache.lucene.search.Searcher;
 13+import org.apache.lucene.search.Similarity;
 14+import org.apache.lucene.search.Weight;
 15+import org.apache.lucene.search.function.DocValues;
 16+import org.apache.lucene.search.function.ValueSource;
 17+import org.apache.lucene.search.function.ValueSourceQuery;
 18+import org.apache.lucene.util.ToStringUtils;
 19+
 20+public class RankValueSourceQuery extends ValueSourceQuery {
 21+ protected ValueSource valSrc;
 22+ /**
 23+ * Create a value source query.
 24+ * @param valSrc provides the values that define the function used for scoring
 25+ */
 26+ public RankValueSourceQuery(ValueSource valSrc) {
 27+ super(valSrc);
 28+ this.valSrc = valSrc;
 29+ }
 30+
 31+ /*(non-Javadoc) @see org.apache.lucene.search.Query#rewrite(org.apache.lucene.index.IndexReader) */
 32+ public Query rewrite(IndexReader reader) throws IOException {
 33+ return this;
 34+ }
 35+
 36+ /*(non-Javadoc) @see org.apache.lucene.search.Query#extractTerms(java.util.Set) */
 37+ public void extractTerms(Set terms) {
 38+ // no terms involved here
 39+ }
 40+
 41+ private class ValueSourceWeight implements Weight {
 42+ Searcher searcher;
 43+ float queryNorm;
 44+ float queryWeight;
 45+
 46+ public ValueSourceWeight(Searcher searcher) {
 47+ this.searcher = searcher;
 48+ }
 49+
 50+ /*(non-Javadoc) @see org.apache.lucene.search.Weight#getQuery() */
 51+ public Query getQuery() {
 52+ return RankValueSourceQuery.this;
 53+ }
 54+
 55+ /*(non-Javadoc) @see org.apache.lucene.search.Weight#getValue() */
 56+ public float getValue() {
 57+ return queryWeight;
 58+ }
 59+
 60+ /*(non-Javadoc) @see org.apache.lucene.search.Weight#sumOfSquaredWeights() */
 61+ public float sumOfSquaredWeights() throws IOException {
 62+ queryWeight = getBoost();
 63+ return queryWeight * queryWeight;
 64+ }
 65+
 66+ /*(non-Javadoc) @see org.apache.lucene.search.Weight#normalize(float) */
 67+ public void normalize(float norm) {
 68+ this.queryNorm = 1;
 69+ queryWeight *= this.queryNorm;
 70+ }
 71+
 72+ /*(non-Javadoc) @see org.apache.lucene.search.Weight#scorer(org.apache.lucene.index.IndexReader) */
 73+ public Scorer scorer(IndexReader reader) throws IOException {
 74+ return new ValueSourceScorer(getSimilarity(searcher), reader, this);
 75+ }
 76+
 77+ /*(non-Javadoc) @see org.apache.lucene.search.Weight#explain(org.apache.lucene.index.IndexReader, int) */
 78+ public Explanation explain(IndexReader reader, int doc) throws IOException {
 79+ return scorer(reader).explain(doc);
 80+ }
 81+ }
 82+
 83+ /**
 84+ * A scorer that (simply) matches all documents, and scores each document with
 85+ * the value of the value source in effect. For example, if the value source
 86+ * is a (cached) field source, then the value of that field in that document
 87+ * will be used (assuming the field is indexed for this doc, with a single token).
 88+ */
 89+ private class ValueSourceScorer extends Scorer {
 90+ private final IndexReader reader;
 91+ private final ValueSourceWeight weight;
 92+ private final int maxDoc;
 93+ private final float qWeight;
 94+ private int doc=-1;
 95+ private final DocValues vals;
 96+
 97+ // constructor
 98+ private ValueSourceScorer(Similarity similarity, IndexReader reader, ValueSourceWeight w) throws IOException {
 99+ super(similarity);
 100+ this.weight = w;
 101+ this.qWeight = w.getValue();
 102+ this.reader = reader;
 103+ this.maxDoc = reader.maxDoc();
 104+ // this is when/where the values are first created.
 105+ vals = valSrc.getValues(reader);
 106+ }
 107+
 108+ /*(non-Javadoc) @see org.apache.lucene.search.Scorer#next() */
 109+ public boolean next() throws IOException {
 110+ for(;;) {
 111+ ++doc;
 112+ if (doc>=maxDoc) {
 113+ return false;
 114+ }
 115+ if (reader.isDeleted(doc)) {
 116+ continue;
 117+ }
 118+ return true;
 119+ }
 120+ }
 121+
 122+ /*(non-Javadoc) @see org.apache.lucene.search.Scorer#doc()
 123+ */
 124+ public int doc() {
 125+ return doc;
 126+ }
 127+
 128+ /*(non-Javadoc) @see org.apache.lucene.search.Scorer#score() */
 129+ public float score() throws IOException {
 130+ return qWeight * vals.floatVal(doc);
 131+ }
 132+
 133+ /*(non-Javadoc) @see org.apache.lucene.search.Scorer#skipTo(int) */
 134+ public boolean skipTo(int target) throws IOException {
 135+ doc=target-1;
 136+ return next();
 137+ }
 138+
 139+ /*(non-Javadoc) @see org.apache.lucene.search.Scorer#explain(int) */
 140+ public Explanation explain(int doc) throws IOException {
 141+ float sc = qWeight * vals.floatVal(doc);
 142+
 143+ Explanation result = new ComplexExplanation(
 144+ true, sc, RankValueSourceQuery.this.toString() + ", product of:");
 145+
 146+ result.addDetail(vals.explain(doc));
 147+ result.addDetail(new Explanation(getBoost(), "boost"));
 148+ result.addDetail(new Explanation(weight.queryNorm,"queryNorm"));
 149+ return result;
 150+ }
 151+ }
 152+
 153+ /*(non-Javadoc) @see org.apache.lucene.search.Query#createWeight(org.apache.lucene.search.Searcher) */
 154+ protected Weight createWeight(Searcher searcher) {
 155+ return new RankValueSourceQuery.ValueSourceWeight(searcher);
 156+ }
 157+
 158+ /* (non-Javadoc) @see org.apache.lucene.search.Query#toString(java.lang.String) */
 159+ public String toString(String field) {
 160+ return valSrc.toString() + ToStringUtils.boost(getBoost());
 161+ }
 162+
 163+ /** Returns true if <code>o</code> is equal to this. */
 164+ public boolean equals(Object o) {
 165+ if (getClass() != o.getClass()) {
 166+ return false;
 167+ }
 168+ RankValueSourceQuery other = (RankValueSourceQuery)o;
 169+ return this.getBoost() == other.getBoost()
 170+ && this.valSrc.equals(other.valSrc);
 171+ }
 172+
 173+ /** Returns a hash code value for this object. */
 174+ public int hashCode() {
 175+ return (getClass().hashCode() + valSrc.hashCode()) ^ Float.floatToIntBits(getBoost());
 176+ }
 177+
 178+
 179+}
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/RankDocValues.java
@@ -0,0 +1,36 @@
 2+package org.wikimedia.lsearch.search;
 3+
 4+import java.io.IOException;
 5+
 6+import org.apache.lucene.index.CorruptIndexException;
 7+import org.apache.lucene.index.IndexReader;
 8+import org.apache.lucene.index.TermDocs;
 9+import org.apache.lucene.search.function.DocValues;
 10+
 11+public class RankDocValues extends DocValues {
 12+ IndexReader reader;
 13+
 14+ public RankDocValues(IndexReader reader){
 15+ super(reader.maxDoc());
 16+ this.reader = reader;
 17+ }
 18+
 19+ protected int getValue(int doc){
 20+ try{
 21+ String r = reader.document(doc).get("rank"); return r == null ? 0 : Integer.parseInt(r); // guard against docs without a rank field
 22+ } catch(IOException e){
 23+ return 0;
 24+ }
 25+ }
 26+
 27+ @Override
 28+ public float floatVal(int doc) {
 29+ return getValue(doc);
 30+ }
 31+
 32+ @Override
 33+ public String toString(int doc) {
 34+ return "rank: "+getValue(doc);
 35+ }
 36+
 37+}
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/RankValueSource.java
@@ -0,0 +1,34 @@
 2+package org.wikimedia.lsearch.search;
 3+
 4+import java.io.IOException;
 5+
 6+import org.apache.lucene.index.IndexReader;
 7+import org.apache.lucene.search.function.DocValues;
 8+import org.apache.lucene.search.function.ValueSource;
 9+
 10+public class RankValueSource extends ValueSource {
 11+
 12+ @Override
 13+ public String description() {
 14+ return "rank";
 15+ }
 16+
 17+ @Override
 18+ public boolean equals(Object o) {
 19+ if(o == this)
 20+ return true;
 21+ else
 22+ return false;
 23+ }
 24+
 25+ @Override
 26+ public DocValues getValues(IndexReader reader) throws IOException {
 27+ return new RankDocValues(reader);
 28+ }
 29+
 30+ @Override
 31+ public int hashCode() {
 32+ return 0;
 33+ }
 34+
 35+}
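Taken together, the three Rank* classes form a match-all query that scores every document by its stored rank; a sketch (index path hypothetical):

    IndexSearcher searcher = new IndexSearcher("/path/to/index");
    Query rank = new RankValueSourceQuery(new RankValueSource());
    TopDocs top = searcher.search(rank, null, 10); // ten highest-ranked documents

Note that RankDocValues fetches the stored field with IndexReader.document() for every scored doc, so this is convenient but not cheap; a FieldCache-backed value source would likely be faster.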
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/UpdateThread.java
@@ -27,6 +27,7 @@
2828 import org.wikimedia.lsearch.interoperability.RMIMessengerClient;
2929 import org.wikimedia.lsearch.interoperability.RMIServer;
3030 import org.wikimedia.lsearch.util.Command;
 31+import org.wikimedia.lsearch.util.FSUtils;
3132
3233
3334 /**
@@ -179,16 +180,16 @@
180181 try{
181182 // if local, use cp -lr instead of rsync
182183 if(global.isLocalhost(iid.getIndexHost())){
183 - Command.exec("/bin/cp -lr "+iid.getSnapshotPath()+sep+li.timestamp+" "+iid.getUpdatePath());
 184+ FSUtils.createHardLinkRecursive(
 185+ iid.getSnapshotPath()+sep+li.timestamp,
 186+ updatepath);
184187 } else{
185188 File ind = new File(iid.getCanonicalSearchPath());
186189
187190 if(ind.exists()){ // prepare a local hard-linked copy of index
188 - ind = ind.getCanonicalFile();
189 - for(File f: ind.listFiles()){
190 - // a cp -lr command for each file in the index
191 - Command.exec("/bin/cp -lr "+ind.getCanonicalPath()+sep+f.getName()+" "+updatepath+sep+f.getName());
192 - }
 191+ FSUtils.createHardLinkRecursive(
 192+ ind.getCanonicalPath(),
 193+ updatepath);
193194 }
194195 long startTime = System.currentTimeMillis();
195196 // rsync
@@ -208,8 +209,8 @@
209210 SearcherCache.SearcherPool pool = new SearcherCache.SearcherPool(iid,li.path,cache.getSearchPoolSize());
210211
211212 // refresh the symlink
212 - Command.exec("/bin/rm -rf "+iid.getSearchPath());
213 - Command.exec("/bin/ln -fs "+updatepath+" "+iid.getSearchPath());
 213+ FSUtils.delete(iid.getSearchPath());
 214+ FSUtils.createSymLink(updatepath,iid.getSearchPath());
214215
215216 // update registry, cache, rmi object
216217 registry.refreshUpdates(iid);
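FSUtils (used throughout this commit to fix bug 11103) replaces the shelled-out commands used before; the calls above map onto them roughly as:

    FSUtils.createHardLinkRecursive(src, dst); // was: /bin/cp -lr src dst
    FSUtils.delete(path);                      // was: /bin/rm -rf path
    FSUtils.createSymLink(target, link);       // was: /bin/ln -fs target link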
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Warmup.java
@@ -65,7 +65,7 @@
6666 /** Warmup index using some number of simple searches */
6767 protected static void warmupSearchTerms(IndexSearcherMul is, IndexId iid, int count, boolean useDelay) {
6868 String lang = global.getLanguage(iid.getDBname());
69 - FieldBuilder.BuilderSet b = new FieldBuilder(lang).getBuilder();
 69+ FieldBuilder.BuilderSet b = new FieldBuilder(iid).getBuilder();
7070 WikiQueryParser parser = new WikiQueryParser(b.getFields().contents(),"0",Analyzers.getSearcherAnalyzer(iid,false),b,WikiQueryParser.NamespacePolicy.IGNORE,null);
7171 Terms terms = getTermsForLang(lang);
7272
@@ -122,7 +122,7 @@
123123 public static void simpleWarmup(IndexSearcherMul is, IndexId iid){
124124 try{
125125 String lang = global.getLanguage(iid.getDBname());
126 - FieldBuilder.BuilderSet b = new FieldBuilder(lang).getBuilder();
 126+ FieldBuilder.BuilderSet b = new FieldBuilder(iid).getBuilder();
127127 WikiQueryParser parser = new WikiQueryParser(b.getFields().contents(),"0",Analyzers.getSearcherAnalyzer(iid,false),b,WikiQueryParser.NamespacePolicy.IGNORE,null);
128128 Query q = parser.parseFourPass("a OR very OR long OR title OR involving OR both OR wikipedia OR and OR pokemons",WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname());
129129 is.search(q,new NamespaceFilterWrapper(new NamespaceFilter("0")));
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java
@@ -27,6 +27,7 @@
2828 import org.apache.lucene.store.Directory;
2929 import org.apache.lucene.store.FSDirectory;
3030 import org.wikimedia.lsearch.analyzers.Analyzers;
 31+import org.wikimedia.lsearch.analyzers.ContextAnalyzer;
3132 import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine;
3233 import org.wikimedia.lsearch.analyzers.FieldBuilder;
3334 import org.wikimedia.lsearch.analyzers.FieldNameFactory;
@@ -41,6 +42,7 @@
4243 import org.wikimedia.lsearch.config.GlobalConfiguration;
4344 import org.wikimedia.lsearch.config.IndexId;
4445 import org.wikimedia.lsearch.interoperability.RMIMessengerClient;
 46+import org.wikimedia.lsearch.ranks.Links;
4547 import org.wikimedia.lsearch.related.RelatedTitle;
4648 import org.wikimedia.lsearch.spell.api.SpellCheckIndexer;
4749 import org.wikimedia.lsearch.util.Localization;
@@ -169,7 +171,15 @@
170172 writer.setUseCompoundFile(true);
171173 writer.setMaxFieldLength(MAX_FIELD_LENGTH);
172174 FieldBuilder.Case dCase = (exactCase)? FieldBuilder.Case.EXACT_CASE : FieldBuilder.Case.IGNORE_CASE;
173 - FieldBuilder builder = new FieldBuilder(langCode,dCase);
 175+ FieldBuilder builder = new FieldBuilder(iid,dCase);
 176+ // TODO: fixme
 177+ Links links = null;
 178+ try {
 179+ links = Links.openForRead(iid,iid.getImportPath());
 180+ } catch (IOException e1) {
 181+ // TODO Auto-generated catch block
 182+ e1.printStackTrace();
 183+ }
174184
175185 for(IndexUpdateRecord rec : records){
176186 if(rec.doAdd()){
@@ -178,7 +188,7 @@
179189 if(!checkPreconditions(rec))
180190 continue; // article shouldn't be added for some reason
181191 IndexReportCard card = getReportCard(rec);
182 - Object[] ret = makeDocumentAndAnalyzer(rec.getArticle(),builder,iid);
 192+ Object[] ret = makeDocumentAndAnalyzer(rec.getArticle(),builder,iid,links);
183193 Document doc = (Document) ret[0];
184194 Analyzer analyzer = (Analyzer) ret[1];
185195 try {
@@ -400,9 +410,8 @@
401411 * @param languageAnalyzer
402412 * @return array { document, analyzer }
403413 */
404 - public static Object[] makeDocumentAndAnalyzer(Article article, FieldBuilder builder, IndexId iid){
 414+ public static Object[] makeDocumentAndAnalyzer(Article article, FieldBuilder builder, IndexId iid, Links links){
405415 PerFieldAnalyzerWrapper perFieldAnalyzer = null;
406 - WikiTokenizer tokenizer = null;
407416 Document doc = new Document();
408417
409418 // tranform record so that unnecessary stuff is deleted, e.g. some redirects
@@ -463,8 +472,10 @@
464473 doc.add(contents);
465474
466475 // related articles
467 - p = makeRelated(doc,fields.related(),article,1);
 476+ p = makeRelated(doc,fields.related(),article,1,fields.context());
468477
 478+ //makeContextField(doc,fields.context(),fields.related());
 479+
469480 // anchors
470481 // makeKeywordField(doc,fields.anchor(),rankBoost);
471482
@@ -479,7 +490,7 @@
480491 }
481492 // make analyzer
482493 String text = article.getContents();
483 - Object[] ret = Analyzers.getIndexerAnalyzer(text,builder,article.getRedirectKeywords(),article.getAnchorText(),article.getRelated(),p);
 494+ Object[] ret = Analyzers.getIndexerAnalyzer(text,builder,article.getRedirectKeywords(),article.getAnchorText(),article.getRelated(),p,article.makeTitle(),links);
484495 perFieldAnalyzer = (PerFieldAnalyzerWrapper) ret[0];
485496
486497
@@ -487,7 +498,7 @@
488499 }
489500
490501 /** Returns partioning of related titles, or null if there aren't any */
491 - protected static int[] makeRelated(Document doc, String prefix, Article article, float boost) {
 502+ protected static int[] makeRelated(Document doc, String prefix, Article article, float boost, String context) {
492503 ArrayList<RelatedTitle> rel = article.getRelated();
493504 if(rel == null || rel.size()==0)
494505 return null;
@@ -501,14 +512,32 @@
502513 for(int i=1;i<RelatedAnalyzer.RELATED_GROUPS;i++){
503514 Field relfield = new Field(prefix+i, "",
504515 Field.Store.NO, Field.Index.TOKENIZED);
505 - relfield.setBoost(boost*(float)MathFunc.avg(scores,p[i-1],p[i]));
 516+ float fb = boost*(float)MathFunc.avg(scores,p[i-1],p[i]);
 517+ relfield.setBoost(fb);
506518 doc.add(relfield);
 519+ if(i <= ContextAnalyzer.CONTEXT_GROUPS){
 520+ Field confield = new Field(context+i, "",
 521+ Field.Store.NO, Field.Index.TOKENIZED);
 522+ confield.setBoost(fb); // use same boost as related field
 523+ doc.add(confield);
 524+ }
507525 }
508526
509527 return p;
510528 }
511529
512 - /** Make a multiple keyword field, e.g. redirect1, redirect2, redirect3 ... */
 530+ /** Make a multiple context field ... */
 531+ protected static void makeContextField(Document doc, String prefix, String related) {
 532+ for(int i=1;i<=ContextAnalyzer.CONTEXT_GROUPS;i++){
 533+ Field keyfield = new Field(prefix+i, "",
 534+ Field.Store.NO, Field.Index.TOKENIZED);
 535+ keyfield.setBoost(doc.getField(related+i).getBoost()); // use same boost as related field
 536+ doc.add(keyfield);
 537+ }
 538+
 539+ }
 540+
 541+ /** Make a multiple keyword field, e.g. keyword1, keyword2, keyword3 ... */
513542 protected static void makeKeywordField(Document doc, String prefix, float boost) {
514543 for(int i=1;i<=KeywordsAnalyzer.KEYWORD_LEVELS;i++){
515544 Field keyfield = new Field(prefix+i, "",
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiSimilarity.java
@@ -39,7 +39,7 @@
4040 float f = (float) (1.0 / (Math.sqrt(numTokens) * numTokens));
4141 //log.debug("Length-norm: "+f+", numtokens: "+numTokens);
4242 return f;
43 - } else if(fieldName.startsWith("redirect") || fieldName.startsWith("keyword") || fieldName.startsWith("related") || fieldName.startsWith("anchor")){
 43+ } else if(fieldName.startsWith("redirect") || fieldName.startsWith("keyword") || fieldName.startsWith("related") || fieldName.startsWith("anchor") || fieldName.startsWith("context")){
4444 return 1;
4545 } else
4646 return super.lengthNorm(fieldName,numTokens);
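Worked example of the norms above: a field hitting the first branch with 4 tokens gets lengthNorm = 1/(sqrt(4)*4) = 0.125, and with 9 tokens 1/27 ≈ 0.037, a much steeper length penalty than Lucene's default 1/sqrt(numTokens); the flat fields (redirect, keyword, related, anchor, and now context) always return 1, so packing many values into them costs nothing in scoring.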
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/IndexThread.java
@@ -38,6 +38,7 @@
3939 import org.wikimedia.lsearch.config.IndexRegistry;
4040 import org.wikimedia.lsearch.interoperability.RMIMessengerClient;
4141 import org.wikimedia.lsearch.util.Command;
 42+import org.wikimedia.lsearch.util.FSUtils;
4243
4344 /**
4445 * Indexer.
@@ -235,20 +236,6 @@
236237 }
237238 }
238239
239 - protected static void deleteDirRecursive(File file){
240 - if(!file.exists())
241 - return;
242 - else if(file.isDirectory()){
243 - File[] files = file.listFiles();
244 - for(File f: files)
245 - deleteDirRecursive(f);
246 - file.delete();
247 - log.debug("Deleted old snapshot at "+file);
248 - } else{
249 - file.delete();
250 - }
251 - }
252 -
253240 /**
254241 * Make a snapshot of all changed indexes
255242 *
@@ -296,20 +283,27 @@
297284 File[] files = spd.listFiles();
298285 for(File f: files){
299286 if(!f.getAbsolutePath().equals(li.path)) // leave the last snapshot
300 - deleteDirRecursive(f);
 287+ FSUtils.deleteRecursive(f);
301288 }
302289 }
303290 new File(snapshot).mkdirs();
 291+ try {
 292+ FSUtils.createHardLinkRecursive(indexPath,snapshot);
 293+ } catch (IOException e) {
 294+ log.error("Error making snapshot "+snapshot+": "+e.getMessage());
 295+ return;
 296+ }
 297+ /*
304298 File ind =new File(indexPath);
305299 for(File f: ind.listFiles()){
306 - // use a cp -lr command for each file in the index
 300+ // hardlink the snapshot
307301 try {
308302 Command.exec("/bin/cp -lr "+indexPath+sep+f.getName()+" "+snapshot+sep+f.getName());
309303 } catch (IOException e) {
310304 log.error("Error making snapshot "+snapshot+": "+e.getMessage());
311305 continue;
312306 }
313 - }
 307+ } */
314308 log.info("Made snapshot "+snapshot);
315309 }
316310
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/HighlightDaemon.java
@@ -128,7 +128,7 @@
129129 FieldBuilder.Case dCase = exactCase? FieldBuilder.Case.EXACT_CASE : FieldBuilder.Case.IGNORE_CASE;
130130 String lang = global.getLanguage(dbname);
131131 Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid,exactCase);
132 - FieldBuilder.BuilderSet bs = new FieldBuilder(lang,dCase).getBuilder(dCase);
 132+ FieldBuilder.BuilderSet bs = new FieldBuilder(iid,dCase).getBuilder(dCase);
133133 WikiQueryParser parser = new WikiQueryParser(bs.getFields().contents(),
134134 new NamespaceFilter("0"),analyzer,bs,WikiQueryParser.NamespacePolicy.IGNORE,null);
135135 Query q = parser.parseFourPass(query,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname());
@@ -139,7 +139,7 @@
140140
141141 for(Article ar : articles){
142142 log.debug("Sending highlighted text for "+ar);
143 - String clean = new CleanupParser(ar.getContents(),lang).parse();
 143+ String clean = new CleanupParser(ar.getContents(),iid).parse();
144144 TokenStream tokens = analyzer.tokenStream("contents",clean);
145145 out.println("HIGHLIGHTING "+ar.getNamespace()+" "+ar.getTitle());
146146 String[] highlighted = highlighter.getBestFragments(tokens,clean,segments);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/CleanupParser.java
@@ -3,6 +3,7 @@
44 import java.util.HashSet;
55 import java.util.Hashtable;
66
 7+import org.wikimedia.lsearch.config.IndexId;
78 import org.wikimedia.lsearch.util.Localization;
89
910 /**
@@ -34,6 +35,7 @@
3536
3637 /** language code */
3738 private String language;
 39+ private IndexId iid;
3840 /** language code -> set (image namespace names) */
3941 private static Hashtable<String,HashSet<String>> imageLocalized = new Hashtable<String,HashSet<String>>();
4042 /** language code -> set (category namespace names) */
@@ -47,10 +49,11 @@
4850
4951 enum FetchState { WORD, CATEGORY, INTERWIKI, KEYWORD };
5052
51 - public CleanupParser(String text, String lang){
 53+ public CleanupParser(String text, IndexId iid){
5254 this.text = text.toCharArray();
5355 this.textString = text;
54 - this.language = lang;
 56+ this.iid = iid;
 57+ this.language = iid.getLangCode();
5558 textLength = text.length();
5659 out = new char[textLength];
5760 }
@@ -409,7 +412,7 @@
410413 else if(language!=null && language.length()!=0){
411414 HashSet<String> loc = imageLocalized.get(language);
412415 if(loc == null){
413 - loc = Localization.getLocalizedImage(language);
 416+ loc = Localization.getLocalizedImage(language,iid.getDBname());
414417 imageLocalized.put(language,loc);
415418 }
416419 if(loc.contains(prefix))
@@ -426,7 +429,7 @@
427430 else if(language!=null && language.length()!=0){
428431 HashSet<String> loc = categoryLocalized.get(language);
429432 if(loc == null){
430 - loc = Localization.getLocalizedCategory(language);
 433+ loc = Localization.getLocalizedCategory(language,iid.getDBname());
431434 categoryLocalized.put(language,loc);
432435 }
433436 if(loc.contains(prefix))
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java
@@ -83,7 +83,7 @@
8484 Iterator it = info.Namespaces.orderedEntries();
8585 while(it.hasNext()){
8686 Entry<Integer,String> pair = (Entry<Integer,String>)it.next();
87 - Localization.addCustomMapping(pair.getValue(),pair.getKey(),langCode);
 87+ Localization.addCustomMapping(pair.getValue(),pair.getKey(),iid.getDBname());
8888 }
8989 }
9090
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java
@@ -16,11 +16,17 @@
1717 import org.apache.lucene.queryParser.ParseException;
1818 import org.apache.lucene.search.BooleanClause;
1919 import org.apache.lucene.search.BooleanQuery;
 20+import org.apache.lucene.search.CustomBoostQuery;
 21+import org.apache.lucene.search.Explanation;
2022 import org.apache.lucene.search.PhraseQuery;
2123 import org.apache.lucene.search.Query;
2224 import org.apache.lucene.search.TermQuery;
2325 import org.apache.lucene.search.WildcardQuery;
2426 import org.apache.lucene.search.BooleanClause.Occur;
 27+import org.apache.lucene.search.function.CustomScoreQuery;
 28+import org.apache.lucene.search.function.FieldScoreQuery;
 29+import org.apache.lucene.search.function.ValueSource;
 30+import org.apache.lucene.search.function.ValueSourceQuery;
2531 import org.apache.lucene.search.spans.SpanNearQuery;
2632 import org.apache.lucene.search.spans.SpanQuery;
2733 import org.apache.lucene.search.spans.SpanTermQuery;
@@ -28,6 +34,8 @@
2935 import org.wikimedia.lsearch.config.GlobalConfiguration;
3036 import org.wikimedia.lsearch.index.WikiIndexModifier;
3137 import org.wikimedia.lsearch.search.NamespaceFilter;
 38+import org.wikimedia.lsearch.search.RankValueSource;
 39+import org.wikimedia.lsearch.search.RankValueSourceQuery;
3240 import org.wikimedia.lsearch.util.UnicodeDecomposer;
3341
3442 /**
@@ -88,12 +96,13 @@
8997 public static float KEYWORD_BOOST = 0.02f;
9098 public static float CONTENTS_BOOST = 0.2f;
9199
92 - public static int ADDITIONAL_PHRASE_SLOP_CONTENTS = 20;
 100+ public static int ADDITIONAL_PHRASE_SLOP_CONTENTS = 5000;
93101 public static float ADDITIONAL_BOOST_CONTENTS = 0.5f;
94 - public static int ADDITIONAL_PHRASE_SLOP_TITLE = 1;
 102+ public static int ADDITIONAL_PHRASE_SLOP_TITLE = 0;
95103 public static float ADDITIONAL_BOOST_TITLE = 0.5f;
96 - public static int ADDITIONAL_PHRASE_SLOP_RELATED = 10;
97 - public static float ADDITIONAL_BOOST_RELATED = 0.04f;
 104+ public static int ADDITIONAL_PHRASE_SLOP_RELATED = 0;
 105+ public static float ADDITIONAL_BOOST_RELATED = 0.2f;
 106+ public static float ADDITIONAL_BOOST_CONTEXT = 0.05f;
98107
99108 public static float WHOLE_TITLE_BOOST = 8f;
100109 public static float EXACT_CONTENTS_BOOST = 1f;
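Note: PhraseQuery slop is the maximum total move distance between term positions, so slop 0 demands the exact phrase, while the new slop of 5000 for contents effectively matches the words anywhere in the field and merely scores closer occurrences higher. An illustration with made-up terms:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.PhraseQuery;

public class SloppyPhraseSketch {
    static PhraseQuery contentsProximity() {
        PhraseQuery pq = new PhraseQuery();
        pq.add(new Term("contents", "douglas"));  // example words
        pq.add(new Term("contents", "adams"));
        pq.setSlop(5000);   // any distance matches; nearer occurrences score higher
        pq.setBoost(0.5f);  // ADDITIONAL_BOOST_CONTENTS
        return pq;
    }
}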
@@ -1422,11 +1431,30 @@
14231432 pq.setSlop(slop);
14241433 return pq;
14251434 }
1426 -
 1435+
14271436 /** Make phrase queries for additional scores */
14281437 public Query makePhraseQueries(ArrayList<String> words, String field, int slop, float boost){
14291438 if(words.size() <= 1)
14301439 return null;
 1440+ else{
 1441+ PhraseQuery pq = new PhraseQuery();
 1442+ for(String w : words){
 1443+ if(!stopWords.contains(w))
 1444+ pq.add(new Term(field,w));
 1445+ }
 1446+ pq.setSlop(slop);
 1447+ pq.setBoost(boost);
 1448+ return pq;
 1449+ }
 1450+
 1451+ }
 1452+
 1453+
 1454+ /** Make phrase queries for additional scores */
 1455+ @Deprecated
 1456+ public Query makePhraseQueriesOld(ArrayList<String> words, String field, int slop, float boost){
 1457+ if(words.size() <= 1)
 1458+ return null;
14311459 else if(words.size() == 2){
14321460 PhraseQuery pq = makePhrase(words,field,slop);
14331461 pq.setBoost(boost);
@@ -1550,26 +1578,73 @@
15511579 // skip last related group
15521580 Query[] pqr = new Query[RelatedAnalyzer.RELATED_GROUPS-1];
15531581 for(int i=1;i<RelatedAnalyzer.RELATED_GROUPS;i++){
1554 - pqr[i-1] = makePhraseQueries(words,"related"+i,ADDITIONAL_PHRASE_SLOP_RELATED,ADDITIONAL_BOOST_RELATED);
 1582+ pqr[i-1] = makePhraseQueries(words,fields.related()+i,ADDITIONAL_PHRASE_SLOP_RELATED,ADDITIONAL_BOOST_RELATED);
15551583 }
15561584 Query[] wqr = new Query[RelatedAnalyzer.RELATED_GROUPS-1];
15571585 for(int i=1;i<RelatedAnalyzer.RELATED_GROUPS;i++){
1558 - wqr[i-1] = makeWordQueries(words,"related"+i,ADDITIONAL_BOOST_RELATED / 4);
 1586+ wqr[i-1] = makeWordQueries(words,fields.related()+i,ADDITIONAL_BOOST_RELATED / 4);
15591587 }
 1588+ Query[] pqx = new Query[ContextAnalyzer.CONTEXT_GROUPS];
 1589+ // make context queries
 1590+ for(int i=1;i<=ContextAnalyzer.CONTEXT_GROUPS;i++){
 1591+ pqx[i-1] = makePhraseQueries(words,fields.context()+i,0,ADDITIONAL_BOOST_CONTEXT);
 1592+ }
15601593 if(wt==null && pqc == null && pqt == null && pqr[0] == null && wqr[0] == null)
15611594 return bq;
15621595 // build the final query
1563 - BooleanQuery finalQuery = new BooleanQuery(true);
 1596+ BooleanQuery coreQuery = new BooleanQuery(true);
15641597 BooleanQuery additional = new BooleanQuery(true);
 1598+ //BooleanQuery boostQuery = new BooleanQuery(true);
15651599
1566 - if(pqc != null)
1567 - additional.add(pqc,Occur.MUST);
 1600+ if(pqc != null){
 1601+ //additional.add(pqc,Occur.MUST);
 1602+ additional.add(new CustomScoreQuery(pqc, new RankValueSourceQuery(new RankValueSource())){
 1603+ public float customScore(int doc, float subQueryScore, float valSrcScore) {
 1604+ return (float) (subQueryScore * Math.log(Math.E+valSrcScore/15));
 1605+ }
 1606+ public Explanation customExplain(int doc, Explanation subQueryExpl, Explanation valSrcExpl) {
 1607+ float valSrcScore = valSrcExpl==null ? 1 : valSrcExpl.getValue();
 1608+ Explanation exp = new Explanation( (float)Math.log(Math.E+valSrcScore/15) * subQueryExpl.getValue(), ": "+valSrcScore+" "+(float)Math.log(Math.E+valSrcScore/15)+"*"+subQueryExpl.getValue()+" custom score: product of:");
 1609+ exp.addDetail(subQueryExpl);
 1610+ if (valSrcExpl != null) {
 1611+ exp.addDetail(valSrcExpl);
 1612+ }
 1613+ return exp;
 1614+ }
 1615+ },Occur.MUST);
 1616+ }
15681617 if(pqt != null)
15691618 additional.add(pqt,Occur.SHOULD);
15701619 if(wt != null)
15711620 additional.add(wt,Occur.SHOULD);
1572 - if(wc != null)
1573 - additional.add(wc,Occur.SHOULD);
 1621+ if(wc != null){
 1622+ // additional.add(wc,Occur.SHOULD);
 1623+ BooleanQuery boostExact = new BooleanQuery();
 1624+ for(Query q : pqr){
 1625+ if(q != null)
 1626+ boostExact.add(q,Occur.SHOULD);
 1627+ }
 1628+ for(Query q : wqr){
 1629+ if(q != null)
 1630+ boostExact.add(q,Occur.SHOULD);
 1631+ }
 1632+ CustomBoostQuery cbq = new CustomBoostQuery(wc,boostExact);
 1633+ /*CustomScoreQuery csq = new CustomScoreQuery(cbq, new RankValueSourceQuery(new RankValueSource())) {
 1634+ public float customScore(int doc, float subQueryScore, float valSrcScore) {
 1635+ return (float) (subQueryScore * Math.log10(10+valSrcScore));
 1636+ }
 1637+ public Explanation customExplain(int doc, Explanation subQueryExpl, Explanation valSrcExpl) {
 1638+ float valSrcScore = valSrcExpl==null ? 1 : valSrcExpl.getValue();
 1639+ Explanation exp = new Explanation( (float)Math.log10(10+valSrcScore) * subQueryExpl.getValue(), "custom score: product of:");
 1640+ exp.addDetail(subQueryExpl);
 1641+ if (valSrcExpl != null) {
 1642+ exp.addDetail(valSrcExpl);
 1643+ }
 1644+ return exp;
 1645+ }
 1646+ }; */
 1647+ additional.add(cbq,Occur.SHOULD);
 1648+ }
15741649 for(Query q : pqr){
15751650 if(q != null)
15761651 additional.add(q,Occur.SHOULD);
@@ -1578,16 +1653,21 @@
15791654 if(q != null)
15801655 additional.add(q,Occur.SHOULD);
15811656 }
 1657+ /*for(Query q : pqx){
 1658+ if(q != null)
 1659+ additional.add(q,Occur.SHOULD);
 1660+ } */
15821661
15831662 // anchors
15841663 //Query anchors = multiplySpans(nostem,0,fields.anchor(),ANCHOR_BOOST);
15851664
1586 - finalQuery.add(bq,Occur.MUST);
1587 - finalQuery.add(additional,Occur.SHOULD);
 1665+ coreQuery.add(bq,Occur.MUST);
 1666+ coreQuery.add(additional,Occur.SHOULD);
15881667 //if(anchors != null)
15891668 // finalQuery.add(anchors,Occur.SHOULD);
15901669
1591 - return finalQuery;
 1670+ return coreQuery;
 1671+ //return new CustomBoostQuery(coreQuery,boostQuery);
15921672
15931673 }
15941674
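For reference, the customScore above multiplies the sloppy-phrase score by log(e + rank/15), assuming RankValueSource yields the article's link rank: rank 0 gives a factor of ln(e) = 1 (no boost), rank 15 gives ln(e+1) ≈ 1.31, rank 150 gives ln(e+10) ≈ 2.54, and rank 1500 gives ln(e+100) ≈ 4.63, a deliberately gentle, logarithmic preference for heavily linked articles.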
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Analyzers.java
@@ -15,9 +15,11 @@
1616 import org.apache.lucene.analysis.ru.RussianStemFilter;
1717 import org.apache.lucene.analysis.th.ThaiWordFilter;
1818 import org.apache.lucene.search.FieldSortedHitQueue;
 19+import org.wikimedia.lsearch.beans.Title;
1920 import org.wikimedia.lsearch.config.GlobalConfiguration;
2021 import org.wikimedia.lsearch.config.IndexId;
2122 import org.wikimedia.lsearch.index.WikiIndexModifier;
 23+import org.wikimedia.lsearch.ranks.Links;
2224 import org.wikimedia.lsearch.related.RelatedTitle;
2325 import org.wikimedia.lsearch.test.AliasPorterStemFilter;
2426
@@ -54,12 +56,13 @@
5557 * @param languageAnalyzer language filter class (e.g. PorterStemFilter)
5658 * @return {PerFieldAnalyzerWrapper,WikiTokenizer}
5759 */
58 - public static Object[] getIndexerAnalyzer(String text, FieldBuilder builder, ArrayList<String> redirects, ArrayList<String> anchors, ArrayList<RelatedTitle> related, int[] relatedPartition) {
 60+ public static Object[] getIndexerAnalyzer(String text, FieldBuilder builder, ArrayList<String> redirects, ArrayList<String> anchors,
 61+ ArrayList<RelatedTitle> related, int[] relatedPartition, Title title, Links links) {
5962 PerFieldAnalyzerWrapper perFieldAnalyzer = new PerFieldAnalyzerWrapper(new SimpleAnalyzer());
6063 WikiTokenizer tokenizer = null;
6164 for(FieldBuilder.BuilderSet bs : builder.getBuilders()){
6265 tokenizer = addFieldsForIndexing(perFieldAnalyzer, text, bs.getFilters(), bs.getFields(),
63 - redirects, anchors, related, relatedPartition, bs.isExactCase(), bs.isAddKeywords());
 66+ redirects, anchors, related, relatedPartition, title, links, bs.isExactCase(), bs.isAddKeywords());
6467 }
6568 return new Object[] {perFieldAnalyzer,tokenizer};
6669 }
@@ -70,9 +73,9 @@
7174 */
7275 public static WikiTokenizer addFieldsForIndexing(PerFieldAnalyzerWrapper perFieldAnalyzer, String text,
7376 FilterFactory filters, FieldNameFactory fields, ArrayList<String> redirects, ArrayList<String> anchors,
74 - ArrayList<RelatedTitle> related, int[] relatedPartition, boolean exactCase, boolean addKeywords) {
 77+ ArrayList<RelatedTitle> related, int[] relatedPartition, Title title, Links links, boolean exactCase, boolean addKeywords) {
7578 // parse wiki-text to get categories
76 - WikiTokenizer tokenizer = new WikiTokenizer(text,filters.getLanguage(),exactCase);
 79+ WikiTokenizer tokenizer = new WikiTokenizer(text,filters.getIndexId(),exactCase);
7780 tokenizer.tokenize();
7881 ArrayList<String> categories = tokenizer.getCategories();
7982 HashMap<String,String> interwiki = tokenizer.getInterwikis();
@@ -106,6 +109,9 @@
107110 // related
108111 setRelatedAnalyzer(perFieldAnalyzer,fields.related(),
109112 new RelatedAnalyzer(related,relatedPartition,filters.getNoStemmerFilterFactory(),fields.related(),exactCase));
 113+ // context
 114+ setContextAnalyzer(perFieldAnalyzer,fields.context(),
 115+ new ContextAnalyzer(title,links,related,relatedPartition,filters.getNoStemmerFilterFactory(),fields.context(),exactCase));
110116 return tokenizer;
111117 }
112118
@@ -126,24 +132,24 @@
127133 perFieldAnalyzer.addAnalyzer(prefix+i,analyzer);
128134 }
129135 }
130 -
131 - public static PerFieldAnalyzerWrapper getSearcherAnalyzer(IndexId iid, boolean exactCase){
132 - if(global == null)
133 - global = GlobalConfiguration.getInstance();
134 - return getSearcherAnalyzer(global.getLanguage(iid.getDBname()),exactCase);
135 -
 136+
 137+ protected static void setContextAnalyzer(PerFieldAnalyzerWrapper perFieldAnalyzer, String prefix, ContextAnalyzer analyzer) {
 138+ for(int i=1;i<=ContextAnalyzer.CONTEXT_GROUPS;i++){
 139+ perFieldAnalyzer.addAnalyzer(prefix+i,analyzer);
 140+ }
136141 }
137142
138 - public static PerFieldAnalyzerWrapper getSearcherAnalyzer(String langCode){
139 - return getSearcherAnalyzer(langCode,false);
 143+
 144+ public static PerFieldAnalyzerWrapper getSearcherAnalyzer(IndexId iid){
 145+ return getSearcherAnalyzer(iid,false);
140146 }
141147
142 - public static PerFieldAnalyzerWrapper getSearcherAnalyzer(String langCode, boolean exactCase){
143 - return getSearcherAnalyzer(new FilterFactory(langCode),new FieldNameFactory(exactCase));
 148+ public static PerFieldAnalyzerWrapper getSearcherAnalyzer(IndexId iid, boolean exactCase){
 149+ return getSearcherAnalyzer(new FilterFactory(iid),new FieldNameFactory(exactCase));
144150 }
145151
146 - public static PerFieldAnalyzerWrapper getSpellCheckAnalyzer(String langCode, HashSet<String> stopWords){
147 - FilterFactory filters = new FilterFactory(langCode,FilterFactory.Type.SPELL_CHECK);
 152+ public static PerFieldAnalyzerWrapper getSpellCheckAnalyzer(IndexId iid, HashSet<String> stopWords){
 153+ FilterFactory filters = new FilterFactory(iid,FilterFactory.Type.SPELL_CHECK);
148154 filters.setStopWords(stopWords);
149155 return getSearcherAnalyzer(filters,new FieldNameFactory());
150156 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FieldNameFactory.java
@@ -60,6 +60,13 @@
6161 return "related";
6262 }
6363
 64+ public String context(){
 65+ if(exactCase)
 66+ return "context_exact";
 67+ else
 68+ return "context";
 69+ }
 70+
6471 public String anchor(){
6572 if(exactCase)
6673 return "anchor_exact";
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java
@@ -9,6 +9,7 @@
1010 import org.apache.lucene.analysis.Analyzer;
1111 import org.apache.lucene.analysis.Token;
1212 import org.apache.lucene.analysis.TokenStream;
 13+import org.wikimedia.lsearch.config.IndexId;
1314
1415 /**
1516 * Analyzer that builds a field with an array of keywords,
@@ -28,6 +29,7 @@
2930 static Logger log = Logger.getLogger(KeywordsAnalyzer.class);
3031 protected KeywordsTokenStream[] tokensBySize = null;
3132 protected String prefix;
 33+ protected IndexId iid;
3234
3335	 /** number of fields to be generated, e.g. keyword1 for single-word keywords,
3436	 * keyword2 for two-word keywords, etc ... the last field has all the remaining keywords
@@ -50,6 +52,7 @@
5153
5254 protected void init(ArrayList<String> keywords, FilterFactory filters, String prefix, boolean exactCase) {
5355 this.prefix = prefix;
 56+ this.iid = filters.getIndexId();
5457 tokensBySize = new KeywordsTokenStream[KEYWORD_LEVELS];
5558 if(keywords == null){
5659 // init empty token streams
@@ -63,7 +66,7 @@
6467 keywordsBySize.add(new ArrayList<String>());
6568	 // arrange keywords into a list by token number
6669 for(String k : keywords){
67 - ArrayList<Token> parsed = new FastWikiTokenizerEngine(k,filters.getLanguage(),exactCase).parse();
 70+ ArrayList<Token> parsed = new FastWikiTokenizerEngine(k,iid,exactCase).parse();
6871 if(parsed.size() == 0)
6972 continue;
7073 else if(parsed.size() < KEYWORD_LEVELS)
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FilterFactory.java
@@ -11,6 +11,7 @@
1212 import org.apache.lucene.analysis.de.GermanStemFilter;
1313 import org.apache.lucene.analysis.snowball.SnowballFilter;
1414 import org.apache.lucene.analysis.th.ThaiWordFilter;
 15+import org.wikimedia.lsearch.config.IndexId;
1516
1617 /**
1718 * Make a language-dependent pair of filters. The custom filter is to be applied before the stemmer.
@@ -20,6 +21,7 @@
2122 */
2223 public class FilterFactory {
2324 protected String lang;
 25+ protected IndexId iid;
2426 protected String snowballName = null;
2527 protected boolean useStemmer,useLangFilter;
2628 protected Class stemmer = null;
@@ -33,18 +35,20 @@
3436 public enum Type { FULL, NO_STEM, SPELL_CHECK };
3537 protected Type type = null;
3638
37 - public FilterFactory(String lang){
38 - this(lang,Type.FULL);
 39+ public FilterFactory(IndexId iid){
 40+ this(iid,Type.FULL);
3941 }
4042
41 - public FilterFactory(String lang, Type type){
42 - this.lang = lang;
 43+ public FilterFactory(IndexId iid, Type type){
 44+ this.lang = iid.getLangCode();
 45+ this.iid = iid;
4346 this.type = type;
4447 init();
45 - noStemmerFilterFactory = new FilterFactory(lang,snowballName,false,useLangFilter,null,langFilter,additionalFilters);
 48+ noStemmerFilterFactory = new FilterFactory(iid,lang,snowballName,false,useLangFilter,null,langFilter,additionalFilters);
4649 }
4750
48 - public FilterFactory(String lang, String snowballName, boolean useStemmer, boolean useLangFilter, Class stemmer, Class langFilter, ArrayList<Class> additionalFilters) {
 51+ public FilterFactory(IndexId iid, String lang, String snowballName, boolean useStemmer, boolean useLangFilter, Class stemmer, Class langFilter, ArrayList<Class> additionalFilters) {
 52+ this.iid = iid;
4953 this.lang = lang;
5054 this.snowballName = snowballName;
5155 this.useStemmer = useStemmer;
@@ -193,6 +197,12 @@
194198 public void setStopWords(Set<String> stopWords){
195199 this.stopWords = stopWords;
196200 }
 201+
 202+ public IndexId getIndexId() {
 203+ return iid;
 204+ }
197205
198206
 207+
 208+
199209 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/QueryLanguageAnalyzer.java
@@ -25,7 +25,7 @@
2626 */
2727 @Override
2828 public TokenStream tokenStream(String fieldName, String text) {
29 - wikitokenizer = new WikiTokenizer(text,filters.getLanguage(),exactCase);
 29+ wikitokenizer = new WikiTokenizer(text,filters.getIndexId(),exactCase);
3030 return super.tokenStream(fieldName,(Reader)null);
3131 }
3232
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/RelatedAnalyzer.java
@@ -21,6 +21,7 @@
2222
2323 public RelatedAnalyzer(ArrayList<RelatedTitle> related, int[] p, FilterFactory filters, String prefix, boolean exactCase) {
2424 this.prefix = prefix;
 25+ this.iid = filters.getIndexId();
2526 tokensBySize = new KeywordsTokenStream[RELATED_GROUPS];
2627 if(related == null || p == null){
2728 // init empty token streams
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/ContextAnalyzer.java
@@ -0,0 +1,60 @@
 2+package org.wikimedia.lsearch.analyzers;
 3+
 4+import java.io.IOException;
 5+import java.util.ArrayList;
 6+import java.util.Collection;
 7+
 8+import org.wikimedia.lsearch.analyzers.KeywordsAnalyzer.KeywordsTokenStream;
 9+import org.wikimedia.lsearch.beans.Title;
 10+import org.wikimedia.lsearch.ranks.Links;
 11+import org.wikimedia.lsearch.related.RelatedTitle;
 12+
 13+/**
 14+ * Link contexts, tokenized with token gaps between individual contexts
 15+ *
 16+ * @author rainman
 17+ *
 18+ */
 19+public class ContextAnalyzer extends KeywordsAnalyzer {
 20+ static public int CONTEXT_GROUPS = 2;
 21+
 22+ static public int TOKEN_GAP = 100;
 23+
 24+ public ContextAnalyzer(Title title, Links links, ArrayList<RelatedTitle> related, int[] p, FilterFactory filters, String prefix, boolean exactCase) {
 25+ this.prefix = prefix;
 26+ this.iid = filters.getIndexId();
 27+ tokensBySize = new KeywordsTokenStream[CONTEXT_GROUPS];
 28+ if(related == null || p == null || title == null || links == null){
 29+ // init empty token streams
 30+ for(int i=0; i< CONTEXT_GROUPS; i++){
 31+ tokensBySize[i] = new KeywordsTokenStream(null,filters,exactCase,TOKEN_GAP);
 32+ }
 33+ return;
 34+ }
 35+ String key = title.getKey();
 36+ // split up the related titles into CONTEXT_GROUPS partitions
 37+ ArrayList<ArrayList<String>> partitions = new ArrayList<ArrayList<String>>();
 38+ for(int i=0;i<CONTEXT_GROUPS;i++){
 39+ ArrayList<String> part = new ArrayList<String>();
 40+ for(int j=p[i];j<p[i+1];j++){
 41+ Title t = related.get(j).getRelated();
 42+ Collection<String> contexts;
 43+ try {
 44+ contexts = links.getContext(t.getKey(),key);
 45+ //System.out.println("CONTEXT "+t.getKey()+" -> "+key+" : "+contexts);
 46+ if(contexts != null)
 47+ part.addAll(contexts);
 48+ } catch (IOException e) {
 49+ log.warn("Cannot fetch context for "+key+" from "+t.getKey()+" : "+e.getMessage());
 50+ e.printStackTrace();
 51+ }
 52+
 53+ }
 54+ partitions.add(part);
 55+ }
 56+ for(int i=0; i< CONTEXT_GROUPS; i++){
 57+ tokensBySize[i] = new KeywordsTokenStream(partitions.get(i),filters,exactCase,TOKEN_GAP);
 58+ }
 59+ }
 60+
 61+}
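Note: TOKEN_GAP leaves 100 empty positions between consecutive contexts in the same field, so a phrase or sloppy-phrase query cannot match across two unrelated contexts. A hypothetical sketch of the idea (not the actual KeywordsTokenStream):

import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

/** Emits each context's words at consecutive positions, then jumps ahead
 *  TOKEN_GAP positions before the next context starts. */
class GappedContextStream extends TokenStream {
    static final int TOKEN_GAP = 100;
    private final Iterator<List<String>> contexts;
    private Iterator<String> words = null;
    private boolean contextStart = false;
    private int offset = 0;

    GappedContextStream(List<List<String>> tokenizedContexts) {
        this.contexts = tokenizedContexts.iterator();
    }

    public Token next() throws IOException {
        while (words == null || !words.hasNext()) {
            if (!contexts.hasNext())
                return null; // end of stream
            words = contexts.next().iterator();
            contextStart = true;
        }
        String w = words.next();
        Token t = new Token(w, offset, offset + w.length());
        offset += w.length() + 1;
        if (contextStart) {
            t.setPositionIncrement(TOKEN_GAP); // open the gap between contexts
            contextStart = false;
        }
        return t;
    }
}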
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FieldBuilder.java
@@ -1,5 +1,7 @@
22 package org.wikimedia.lsearch.analyzers;
33
 4+import org.wikimedia.lsearch.config.IndexId;
 5+
46 /**
57	 * Aggregate class for FilterFactory and FieldNameFactory. This class
68 * contains methods used to build various fields of the index,
@@ -47,15 +49,15 @@
4850 public static enum Options { NONE, SPELL_CHECK };
4951
5052 /** Construct case-insensitive field builder with stemming */
51 - public FieldBuilder(String lang){
52 - this(lang,Case.IGNORE_CASE,Stemmer.USE_STEMMER,Options.NONE);
 53+ public FieldBuilder(IndexId iid){
 54+ this(iid,Case.IGNORE_CASE,Stemmer.USE_STEMMER,Options.NONE);
5355 }
5456
55 - public FieldBuilder(String lang, Case useCase){
56 - this(lang,useCase,Stemmer.USE_STEMMER,Options.NONE);
 57+ public FieldBuilder(IndexId iid, Case useCase){
 58+ this(iid,useCase,Stemmer.USE_STEMMER,Options.NONE);
5759 }
5860
59 - public FieldBuilder(String lang, Case useCase, Stemmer useStemmer, Options options){
 61+ public FieldBuilder(IndexId iid, Case useCase, Stemmer useStemmer, Options options){
6062 FilterFactory.Type type = FilterFactory.Type.FULL;
6163 if(options == Options.SPELL_CHECK)
6264 type = FilterFactory.Type.SPELL_CHECK;
@@ -63,7 +65,7 @@
6466 if(useCase == Case.EXACT_CASE){
6567 builders = new BuilderSet[2];
6668 builders[1] = new BuilderSet(
67 - new FilterFactory(lang,type).getNoStemmerFilterFactory(),
 69+ new FilterFactory(iid,type).getNoStemmerFilterFactory(),
6870 new FieldNameFactory(FieldNameFactory.EXACT_CASE));
6971 } else
7072 builders = new BuilderSet[1];
@@ -71,11 +73,11 @@
7274 // default factory, lowercase all data
7375 if(useStemmer == Stemmer.USE_STEMMER){
7476 builders[0] = new BuilderSet(
75 - new FilterFactory(lang,type),
 77+ new FilterFactory(iid,type),
7678 new FieldNameFactory());
7779 } else{
7880 builders[0] = new BuilderSet(
79 - new FilterFactory(lang,type).getNoStemmerFilterFactory(),
 81+ new FilterFactory(iid,type).getNoStemmerFilterFactory(),
8082 new FieldNameFactory());
8183 }
8284
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiTokenizer.java
@@ -10,6 +10,7 @@
1111 import org.apache.log4j.Logger;
1212 import org.apache.lucene.analysis.Token;
1313 import org.apache.lucene.analysis.Tokenizer;
 14+import org.wikimedia.lsearch.config.IndexId;
1415
1516 /** Uses FastWikiTokenizerEngine to tokenize text */
1617 public class WikiTokenizer extends Tokenizer {
@@ -36,8 +37,8 @@
3738 * @param str
3839 */
3940
40 - public WikiTokenizer(String str, String lang, boolean exactCase){
41 - parser = new FastWikiTokenizerEngine(str,lang,exactCase);
 41+ public WikiTokenizer(String str, IndexId iid, boolean exactCase){
 42+ parser = new FastWikiTokenizerEngine(str,iid,exactCase);
4243 this.input = null;
4344 }
4445
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/SplitAnalyzer.java
@@ -10,7 +10,7 @@
1111 import org.apache.lucene.analysis.Tokenizer;
1212 import org.wikimedia.lsearch.ranks.StringList;
1313
14 -/** Split the text by some specific char */
 14+/** Analyzes serialized StringLists into their components */
1515 public class SplitAnalyzer extends Analyzer {
1616 class SplitTokenStream extends Tokenizer {
1717 Iterator<String> it = null;
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java
@@ -9,6 +9,7 @@
1010
1111 import org.apache.commons.lang.WordUtils;
1212 import org.apache.lucene.analysis.Token;
 13+import org.wikimedia.lsearch.config.IndexId;
1314 import org.wikimedia.lsearch.util.Localization;
1415 import org.wikimedia.lsearch.util.UnicodeDecomposer;
1516
@@ -67,6 +68,7 @@
6869
6970 /** language code */
7071 private String language;
 72+ private IndexId iid;
7173 /** language code -> set (image namespace names) */
7274 private static Hashtable<String,HashSet<String>> imageLocalized = new Hashtable<String,HashSet<String>>();
7375 /** language code -> set (category namespace names) */
@@ -111,10 +113,11 @@
112114 }
113115 }
114116
115 - public FastWikiTokenizerEngine(String text, String lang, boolean exactCase){
 117+ public FastWikiTokenizerEngine(String text, IndexId iid, boolean exactCase){
116118 this.text = text.toCharArray();
117119 this.textString = text;
118 - this.language = lang;
 120+ this.language = iid.getLangCode();
 121+ this.iid = iid;
119122 this.exactCase = exactCase;
120123 textLength = text.length();
121124 init();
@@ -744,7 +747,7 @@
745748 else if(language!=null && language.length()!=0){
746749 HashSet<String> loc = imageLocalized.get(language);
747750 if(loc == null){
748 - loc = Localization.getLocalizedImage(language);
 751+ loc = Localization.getLocalizedImage(language,iid.getDBname());
749752 imageLocalized.put(language,loc);
750753 }
751754 if(loc.contains(prefix))
@@ -761,7 +764,7 @@
762765 else if(language!=null && language.length()!=0){
763766 HashSet<String> loc = categoryLocalized.get(language);
764767 if(loc == null){
765 - loc = Localization.getLocalizedCategory(language);
 768+ loc = Localization.getLocalizedCategory(language,iid.getDBname());
766769 categoryLocalized.put(language,loc);
767770 }
768771 if(loc.contains(prefix))
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/Localization.java
@@ -8,6 +8,7 @@
99 import java.util.HashMap;
1010 import java.util.Hashtable;
1111 import java.util.HashSet;
 12+import java.util.Map;
1213 import java.util.Map.Entry;
1314
1415 import org.apache.log4j.Logger;
@@ -28,9 +29,16 @@
2930 protected static Object lock = new Object();
3031 /** Languages for which loading of localization failed */
3132 protected static HashSet<String> badLocalizations = new HashSet<String>();
 33+	/** Languages for which we loaded localization */
 34+ protected static HashSet<String> loadedLocalizations = new HashSet<String>();
3235 protected static HashSet<String> interwiki = null;
3336	/** lowercased canonical names of namespaces */
34 - protected static Hashtable<String,Integer> canonicalNamespaces = new Hashtable<String,Integer>();
 37+ protected static Hashtable<String,Integer> canonicalNamespaces = new Hashtable<String,Integer>();
 38+ /** dbname -> meta namespace name */
 39+ protected static Hashtable<String,String> metaNamespaces = new Hashtable<String,String>();
 40+	/** custom mappings (from OAI headers, etc.) dbname -> nsname -> nsindex */
 41+ protected static Hashtable<String,Hashtable<String,Integer>> customNamespaces = new Hashtable<String,Hashtable<String,Integer>>();
 42+
3543 static{
3644 canonicalNamespaces.put("media",-2);
3745 canonicalNamespaces.put("special",-1);
@@ -51,48 +59,72 @@
5260 canonicalNamespaces.put("category_talk",15);
5361 }
5462
 63+ /** set meta namespaces for specific db names */
 64+ public static void setMetaNamespace(Map<String,String> dbmeta){
 65+ synchronized(lock){
 66+ metaNamespaces.putAll(dbmeta);
 67+ }
 68+ }
 69+
5570 /** Add custom mapping not found in localization files from other source, e.g. project name, etc.. */
56 - public static void addCustomMapping(String namespace, int index, String langCode){
 71+ public static void addCustomMapping(String namespace, int index, String dbname){
5772 synchronized(lock){
58 - Hashtable<String,Integer> map = namespaces.get(langCode);
 73+ Hashtable<String,Integer> map = customNamespaces.get(dbname);
5974 if(map == null){
6075 map = new Hashtable<String,Integer>();
61 - namespaces.put(langCode,map);
 76+ customNamespaces.put(dbname,map);
6277 }
6378 map.put(namespace.toLowerCase(),index);
6479 }
6580 }
66 -
67 - public static HashSet<String> getLocalizedImage(String langCode){
68 - return getLocalizedNamespace(langCode,6);
 81+ /** Get a new hashset of localized image namespace names */
 82+ public static HashSet<String> getLocalizedImage(String langCode, String dbname){
 83+ return getLocalizedNamespace(langCode,6,dbname);
6984 }
70 -
71 - public static HashSet<String> getLocalizedCategory(String langCode){
72 - return getLocalizedNamespace(langCode,14);
 85+ /** Get a new hashset of localized category namespace names */
 86+ public static HashSet<String> getLocalizedCategory(String langCode, String dbname){
 87+ return getLocalizedNamespace(langCode,14,dbname);
7388 }
7489
75 - public static HashSet<String> getLocalizedNamespace(String langCode, int nsId){
 90+ public static HashSet<String> getLocalizedNamespace(String langCode, int nsId, String dbname){
7691 synchronized (lock){
 92+ HashSet<String> res = new HashSet<String>();
7793 langCode = langCode.toLowerCase();
78 - if(namespaces.get(langCode)==null){
79 - if(badLocalizations.contains(langCode) || !readLocalization(langCode))
80 - return new HashSet<String>();
 94+ if(namespaces.get(langCode)==null)
 95+ readLocalization(langCode);
 96+
 97+ // get namespaces from message files
 98+ res.addAll(collect(namespaces.get(langCode),nsId));
 99+ // get db-specific names, like meta namespaces or ones obtained via oai or other ways
 100+ if(dbname != null){
 101+ res.addAll(collect(customNamespaces.get(dbname),nsId));
 102+ if(nsId == 4 && metaNamespaces.containsKey(dbname))
 103+ res.add(metaNamespaces.get(dbname));
81104 }
82 - return collect(namespaces.get(langCode),nsId);
 105+ return res;
83106 }
84107 }
85108
86109 /** Get mapping namespace_name (lowercase) -> namespace_index */
87 - public static HashMap<String,Integer> getLocalizedNamespaces(String langCode){
 110+ public static HashMap<String,Integer> getLocalizedNamespaces(String langCode, String dbname){
88111 synchronized (lock){
89112 HashMap<String,Integer> ret = new HashMap<String,Integer>();
90113 ret.putAll(canonicalNamespaces);
91114 langCode = langCode.toLowerCase();
92 - if(namespaces.get(langCode)==null){
93 - if(badLocalizations.contains(langCode) || !readLocalization(langCode))
94 - return ret;
 115+ if(namespaces.get(langCode)==null)
 116+ readLocalization(langCode);
 117+ // localization from messages files
 118+ if(namespaces.containsKey(langCode))
 119+ ret.putAll(namespaces.get(langCode));
 120+ // db-specific
 121+ if(dbname != null){
 122+ // meta namespaces
 123+ if(metaNamespaces.containsKey(dbname))
 124+ ret.put(metaNamespaces.get(dbname),4);
 125+ // custom
 126+ if(customNamespaces.containsKey(dbname))
 127+ ret.putAll(customNamespaces.get(dbname));
95128 }
96 - ret.putAll(namespaces.get(langCode));
97129 return ret;
98130 }
99131 }
@@ -107,6 +139,8 @@
108140 /** Collect all the names with some certain namespace id */
109141 protected static HashSet<String> collect(Hashtable<String,Integer> ns, int nsid) {
110142 HashSet<String> ret = new HashSet<String>();
 143+ if(ns == null)
 144+ return ret;
111145 for(Entry<String,Integer> e : ns.entrySet()){
112146 if(e.getValue().intValue() == nsid)
113147 ret.add(e.getKey());
@@ -123,6 +157,10 @@
124158 /** Level is recursion level (to detect infinite recursion if language
125159 * defines itself as a fallback) */
126160 protected static boolean readLocalization(String langCode, int level){
 161+ if(badLocalizations.contains(langCode))
 162+ return false; // failed previously
 163+ if(loadedLocalizations.contains(langCode))
 164+ return true; // already loaded
127165 Configuration config = Configuration.open();
128166 if(langCode == null || langCode.equals(""))
129167 return false;
@@ -158,6 +196,7 @@
159197 if(ns!=null && ns.size()!=0){
160198 namespaces.put(langCode.toLowerCase(),ns);
161199	 log.debug("Successfully loaded localization for "+langCode.toLowerCase());
 200+ loadedLocalizations.add(langCode);
162201 return true;
163202	 } else{ // maybe a fallback language is defined instead
164203 String fallback = parser.getFallBack(text);
@@ -165,6 +204,7 @@
166205 fallback = fallback.replace('-','_');
167206 boolean succ = readLocalization(fallback,level+1);
168207 if(succ){
 208+ loadedLocalizations.add(fallback);
169209 namespaces.put(langCode.toLowerCase(),namespaces.get(fallback.toLowerCase()));
170210 redirects.put(langCode.toLowerCase(),redirects.get(fallback.toLowerCase()));
171211 }
@@ -216,9 +256,13 @@
217257 int end = line.indexOf("]]");
218258 if(begin != -1 && end != -1 && end > begin){
219259 String redirectText = text.substring(begin+2,end);
 260+ int pipe = redirectText.indexOf('|');
 261+ if(pipe != -1)
 262+ redirectText = redirectText.substring(0,pipe);
220263 int fragment = redirectText.lastIndexOf('#');
221264 if(fragment != -1)
222265 redirectText = redirectText.substring(0,fragment);
 266+ redirectText = redirectText.replace('_',' ');
223267 return redirectText;
224268 }
225269 }
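For example, with the two normalizations added above, a redirect line containing [[Foo_bar#History|label]] now resolves to the target "Foo bar": the pipe and label are dropped first, then the #History fragment, and finally underscores become spaces (the titles here are made up).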
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/FSUtils.java
@@ -0,0 +1,139 @@
 2+package org.wikimedia.lsearch.util;
 3+
 4+import java.io.File;
 5+import java.io.IOException;
 6+
 7+/**
 8+ * Various abstractions of file system operations: delete dirs,
 9+ * make soft/hard links ...
 10+ *
 11+ * Based on FileUtil.java from Lucene Hadoop project (Apache Licence)
 12+ * @author rainman
 13+ *
 14+ */
 15+public class FSUtils {
 16+ public static final String PATH_SEP = System.getProperty("file.separator");
 17+
 18+ enum OSType { OS_TYPE_UNIX, OS_TYPE_WINXP };
 19+
 20+ protected static String[] hardLinkCommand;
 21+
 22+ static {
 23+ switch(getOSType()) {
 24+ case OS_TYPE_WINXP:
 25+ hardLinkCommand = new String[] {"fsutil","hardlink","create", null, null};
 26+ break;
 27+ case OS_TYPE_UNIX:
 28+ default:
 29+ hardLinkCommand = new String[] {"ln", null, null};
 30+ }
 31+ }
 32+
 33+ static OSType getOSType() {
 34+ String osName = System.getProperty("os.name");
 35+ if (osName.indexOf("Windows") >= 0 &&
 36+ (osName.indexOf("XP") >= 0 || osName.indexOf("2003") >= 0))
 37+ return OSType.OS_TYPE_WINXP;
 38+ else
 39+ return OSType.OS_TYPE_UNIX;
 40+ }
 41+
 42+ /**
 43+ * Create a hardlink in the filesystem.
 44+ *
 45+ * @param target
 46+ * @param linkName
 47+ * @throws IOException
 48+ */
 49+ public static void createHardLink(File target, File linkName) throws IOException {
 50+ int len = hardLinkCommand.length;
 51+ hardLinkCommand[len-2] = target.getCanonicalPath();
 52+ hardLinkCommand[len-1] = linkName.getCanonicalPath();
 53+ Command.exec(hardLinkCommand);
 54+ }
 55+
 56+ /**
 57+ * Create hard links recursively if the target is a directory
 58+ *
 59+ * @param target
 60+ * @param linkname
 61+ * @throws IOException
 62+ */
 63+ public static void createHardLinkRecursive(String target, String linkname) throws IOException {
 64+ File file = new File(target);
 65+ if(!file.exists())
 66+		throw new IOException("Trying to hardlink nonexistent file "+target);
 67+ if(file.isDirectory()){
 68+ File[] files = file.listFiles();
 69+ for(File f: files)
 70+ createHardLinkRecursive(format(new String[]{target,f.getName()}),format(new String[] {linkname,f.getName()}));
 71+ } else
 72+ createHardLink(new File(target),new File(linkname));
 73+ }
 74+
 75+
 76+ /**
 77+ * Create a soft link between a src and destination
 78+ * only on a local disk. HDFS does not support this
 79+ * @param target the target for symlink
 80+ * @param linkname the symlink
 81+ */
 82+ public static void createSymLink(String target, String linkname) throws IOException{
 83+ String cmd = "ln -s " + target + " " + linkname;
 84+ Command.exec(cmd);
 85+ }
 86+
 87+ /**
 88+	 * Append path parts via the system's path separator.
 89+	 * I.e. {"/usr/local", "search"} -> "/usr/local/search/" (note the trailing separator)
 90+ * @param parts
 91+ */
 92+ public static String format(String[] parts){
 93+ StringBuilder sb = new StringBuilder();
 94+ boolean first = true;
 95+ for(String p : parts){
 96+ if(!first && p.startsWith(PATH_SEP))
 97+ p = p.substring(PATH_SEP.length());
 98+ sb.append(p);
 99+ if(!p.endsWith(PATH_SEP))
 100+ sb.append(PATH_SEP);
 101+ if(first)
 102+ first = false;
 103+ }
 104+ return sb.toString();
 105+ }
 106+
 107+ /**
 108+ * Construct a file from parts of path
 109+ * @param parts
 110+ */
 111+ public static File formatFile(String[] parts){
 112+ return new File(format(parts));
 113+ }
 114+
 115+ /**
 116+ * Delete a file recursively
 117+ *
 118+ * @param file
 119+ */
 120+ public static void deleteRecursive(File file){
 121+ if(!file.exists())
 122+ return;
 123+ else if(file.isDirectory()){
 124+ File[] files = file.listFiles();
 125+ for(File f: files)
 126+ deleteRecursive(f);
 127+ file.delete();
 128+ } else{
 129+ file.delete();
 130+ }
 131+ }
 132+
 133+ /** Delete single file */
 134+ public static void delete(String path) {
 135+ File f = new File(path);
 136+ if(f.exists()) // if doesn't exist don't complain
 137+ f.delete();
 138+ }
 139+
 140+}
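Note: since Lucene segment files are immutable once written, recursive hardlinks give a cheap point-in-time copy of an index, which is how the snapshot code above now uses this class. A hedged sketch with made-up paths:

import java.io.File;
import java.io.IOException;
import org.wikimedia.lsearch.util.FSUtils;

public class SnapshotSketch {
    public static void main(String[] args) throws IOException {
        String index = "/usr/local/search/indexes/enwiki";    // hypothetical
        String snap  = "/usr/local/search/snapshots/enwiki";  // hypothetical
        FSUtils.deleteRecursive(new File(snap)); // drop any stale snapshot
        new File(snap).mkdirs();
        // hardlink each index file into the snapshot directory
        FSUtils.createHardLinkRecursive(index, snap);
    }
}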
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/Command.java
@@ -24,10 +24,17 @@
2525 }
2626
2727 public static void exec(String command) throws IOException {
 28+ exec(new String[] {command});
 29+ }
 30+
 31+ public static void exec(String[] command) throws IOException {
2832 Process p = null;
2933 log.debug("Executing shell command "+command);
3034 try {
31 - p = Runtime.getRuntime().exec(command);
 35+ if(command.length == 1)
 36+ p = Runtime.getRuntime().exec(command[0]);
 37+ else
 38+ p = Runtime.getRuntime().exec(command);
3239 p.waitFor();
3340 if(p.exitValue()!=0){
3441 log.warn("Got exit value "+p.exitValue()+" while executing "+command);
@@ -43,6 +50,8 @@
4451 throw new IOException("Interrupted");
4552 } finally {
4653 closeStreams(p);
 54+ if(p != null)
 55+ p.destroy();
4756 }
4857 }
4958
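Note: Runtime.exec(String) tokenizes its argument on whitespace, so the new String[] overload exists to pass arguments through verbatim, as the ln/fsutil argument arrays in FSUtils require. A hedged usage sketch with made-up paths:

import java.io.IOException;
import org.wikimedia.lsearch.util.Command;

public class CommandSketch {
    public static void main(String[] args) throws IOException {
        // array form: "My Index" survives as a single argument
        Command.exec(new String[]{ "ln", "/data/My Index/segments", "/data/snap/segments" });
        // single-string form remains for simple commands
        Command.exec("ls -l /data");
    }
}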
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/PHPParser.java
@@ -162,6 +162,24 @@
163163 return servers;
164164 }
165165
 166+ /** Get wgMetaNamespace (dbname->metans name) from InitialiseSettings */
 167+ public Hashtable<String,String> getMetaNamespace(String text){
 168+ text = text.replaceAll("(#.*)",""); // strip comments
 169+ Hashtable<String,String> meta = new Hashtable<String,String>();
 170+
 171+ int flags = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;
 172+ Pattern wgmeta = Pattern.compile("[\"']wgMetaNamespace[\"']\\s*=>\\s*array\\s*\\((.*?)\\)",flags);
 173+ Pattern entry = Pattern.compile("[\"'](.*?)[\"']\\s*=>\\s*[\"'](.*?)[\"']",flags);
 174+ Matcher matcher = wgmeta.matcher(text);
 175+ while(matcher.find()){
 176+ Matcher me = entry.matcher(matcher.group(1));
 177+ while(me.find()){
 178+ meta.put(me.group(1),me.group(2));
 179+ }
 180+ }
 181+ return meta;
 182+ }
 183+
166184 /** Get wgNamespacesToBeSearchedDefault from InitialiseSettings */
167185 public Hashtable<String,NamespaceFilter> getDefaultSearch(String text){
168186 text = text.replaceAll("(#.*)",""); // strip comments
@@ -276,6 +294,7 @@
277295 System.out.println(p.getLanguages(initset));
278296 System.out.println(p.getServer(initset));
279297 System.out.println(p.getDefaultSearch(initset));
 298+ System.out.println(p.getMetaNamespace(initset));
280299
281300
282301 }
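For example, given an InitialiseSettings fragment such as 'wgMetaNamespace' => array( 'frwiki' => 'Wikipédia', 'plwiki' => 'Wikipedia' ), getMetaNamespace returns the map {frwiki=Wikipédia, plwiki=Wikipedia}; the dbnames and values here are illustrative.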
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/TitleReader.java
@@ -1,67 +0,0 @@
2 -package org.wikimedia.lsearch.ranks;
3 -
4 -import java.io.IOException;
5 -import java.util.ArrayList;
6 -import java.util.HashMap;
7 -import java.util.HashSet;
8 -import java.util.Iterator;
9 -import java.util.Map.Entry;
10 -
11 -import org.mediawiki.importer.DumpWriter;
12 -import org.mediawiki.importer.Page;
13 -import org.mediawiki.importer.Revision;
14 -import org.mediawiki.importer.Siteinfo;
15 -import org.wikimedia.lsearch.beans.ArticleLinks;
16 -import org.wikimedia.lsearch.beans.Title;
17 -import org.wikimedia.lsearch.config.IndexId;
18 -import org.wikimedia.lsearch.util.Localization;
19 -
20 -/**
21 - * Read a HashSet of titles from dump
22 - *
23 - * @author rainman
24 - *
25 - */
26 -public class TitleReader implements DumpWriter{
27 - Page page;
28 - Revision revision;
29 - Links links;
30 - protected String langCode;
31 -
32 - public TitleReader(String langCode, IndexId iid) throws IOException{
33 - this.langCode = langCode;
34 - this.links = Links.createNew(iid);
35 - }
36 -
37 - public void writeRevision(Revision revision) throws IOException {
38 - this.revision = revision;
39 - }
40 - public void writeStartPage(Page page) throws IOException {
41 - this.page = page;
42 - }
43 - public void writeEndPage() throws IOException {
44 - String key = page.Title.Namespace+":"+page.Title.Text;
45 - links.addTitle(new Title(key));
46 - }
47 - public Links getLinks() {
48 - return links;
49 - }
50 - public void close() throws IOException {
51 - // nop
52 - }
53 - public void writeEndWiki() throws IOException {
54 - // nop
55 - }
56 - public void writeSiteinfo(Siteinfo info) throws IOException {
57 - // write siteinfo to localization
58 - Iterator it = info.Namespaces.orderedEntries();
59 - while(it.hasNext()){
60 - Entry<Integer,String> pair = (Entry<Integer,String>)it.next();
61 - Localization.addCustomMapping(pair.getValue(),pair.getKey(),langCode);
62 - links.addToNamespaceMap(pair.getValue(),pair.getKey());
63 - }
64 - }
65 - public void writeStartWiki() throws IOException {
66 - // nop
67 - }
68 -}
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/LinkReader.java
@@ -3,6 +3,8 @@
44 import java.io.IOException;
55 import java.util.HashMap;
66 import java.util.HashSet;
 7+import java.util.Iterator;
 8+import java.util.Map.Entry;
79 import java.util.regex.Matcher;
810 import java.util.regex.Pattern;
911
@@ -35,12 +37,14 @@
3638 Links links;
3739 HashSet<String> interwiki;
3840 String langCode;
 41+ IndexId iid;
3942
40 - public LinkReader(Links links, String langCode){
 43+ public LinkReader(Links links, IndexId iid, String langCode){
4144 this.links = links;
4245 if(langCode == null || langCode.equals(""))
4346 langCode = "en";
4447 this.langCode = langCode;
 48+ this.iid = iid;
4549 interwiki = Localization.getInterwiki();
4650 }
4751 public void writeRevision(Revision revision) throws IOException {
@@ -50,10 +54,23 @@
5155 this.page = page;
5256 }
5357 public void writeEndPage() throws IOException {
54 - links.addArticleInfo(revision.Text,new Title(page.Title.Namespace,page.Title.Text));
 58+ Title t = new Title(page.Title.Namespace,page.Title.Text);
 59+ try{
 60+ links.addArticleInfo(revision.Text,t);
 61+ } catch(Exception e){
 62+ log.error("Error adding article "+t+" : "+e.getMessage());
 63+ e.printStackTrace();
 64+ }
5565 }
5666 public void writeSiteinfo(Siteinfo info) throws IOException {
5767 siteinfo = info;
 68+ // write siteinfo to localization
 69+ Iterator it = info.Namespaces.orderedEntries();
 70+ while(it.hasNext()){
 71+ Entry<Integer,String> pair = (Entry<Integer,String>)it.next();
 72+ Localization.addCustomMapping(pair.getValue(),pair.getKey(),iid.getDBname());
 73+ links.addToNamespaceMap(pair.getValue(),pair.getKey());
 74+ }
5875 }
5976 public void close() throws IOException {
6077 // nop
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Links.java
@@ -1,6 +1,11 @@
22 package org.wikimedia.lsearch.ranks;
33
 4+import java.io.ByteArrayInputStream;
 5+import java.io.ByteArrayOutputStream;
46 import java.io.IOException;
 7+import java.io.ObjectInputStream;
 8+import java.io.ObjectOutputStream;
 9+import java.io.StringWriter;
510 import java.util.ArrayList;
611 import java.util.Collection;
712 import java.util.HashMap;
@@ -15,6 +20,9 @@
1621 import org.apache.lucene.analysis.SimpleAnalyzer;
1722 import org.apache.lucene.document.Document;
1823 import org.apache.lucene.document.Field;
 24+import org.apache.lucene.document.FieldSelector;
 25+import org.apache.lucene.document.SetBasedFieldSelector;
 26+import org.apache.lucene.index.CorruptIndexException;
1927 import org.apache.lucene.index.IndexReader;
2028 import org.apache.lucene.index.IndexWriter;
2129 import org.apache.lucene.index.Term;
@@ -29,6 +37,7 @@
3038 import org.wikimedia.lsearch.config.IndexId;
3139 import org.wikimedia.lsearch.index.WikiIndexModifier;
3240 import org.wikimedia.lsearch.related.CompactArticleLinks;
 41+import org.wikimedia.lsearch.search.NamespaceFilter;
3342 import org.wikimedia.lsearch.spell.api.Dictionary;
3443 import org.wikimedia.lsearch.spell.api.LuceneDictionary;
3544 import org.wikimedia.lsearch.spell.api.Dictionary.Word;
@@ -40,61 +49,93 @@
4150 protected String langCode;
4251 protected IndexWriter writer = null;
4352 protected HashMap<String,Integer> nsmap = null;
44 - protected HashSet<String> interwiki = new HashSet<String>();
 53+ protected HashSet<String> interwiki;
 54+ protected HashSet<String> categoryLocalized;
 55+ protected HashSet<String> imageLocalized;
4556 protected IndexReader reader = null;
4657 protected String path;
47 - protected enum State { MODIFIED_TITLES, FLUSHED, MODIFIED_ARTICLES, READ };
 58+ protected enum State { FLUSHED, WRITE, MODIFIED, READ };
4859 protected State state;
49 - protected Directory directory;
 60+ protected Directory directory = null;
 61+ protected NamespaceFilter nsf; // default search
 62+ protected ObjectCache cache;
 63+ //protected ObjectCache refCache;
 64+ protected FieldSelector keyOnly,redirectOnly,contextOnly,linksOnly;
5065
51 - private Links(IndexId iid){
 66+ private Links(IndexId iid, String path, IndexWriter writer) throws CorruptIndexException, IOException{
 67+ this.writer = writer;
 68+ this.path = path;
5269 this.iid = iid;
53 - this.langCode = GlobalConfiguration.getInstance().getLanguage(iid);
 70+ GlobalConfiguration global = GlobalConfiguration.getInstance();
 71+ this.langCode = global.getLanguage(iid);
 72+ String dbname = iid.getDBname();
 73+ nsmap = Localization.getLocalizedNamespaces(langCode,dbname);
 74+ interwiki = Localization.getInterwiki();
 75+ categoryLocalized = Localization.getLocalizedCategory(langCode,dbname);
 76+ imageLocalized = Localization.getLocalizedImage(langCode,dbname);
 77+ state = State.FLUSHED;
 78+ initWriter(writer);
 79+ //reader = IndexReader.open(path);
 80+ nsf = global.getDefaultNamespace(iid);
 81+ cache = new ObjectCache(10000);
 82+ // init cache manager
 83+ /*CacheManager manager = CacheManager.create();
 84+ cache = new Cache("links", 5000, false, false, 5, 2);
 85+ manager.addCache(cache); */
 86+ keyOnly = makeSelector("article_key");
 87+ redirectOnly = makeSelector("redirect");
 88+ contextOnly = makeSelector("context");
 89+ linksOnly = makeSelector("links");
5490 }
5591
56 - public static Links openExisting(IndexId iid) throws IOException{
57 - Links links = new Links(iid);
58 - links.path = iid.getTempPath();
59 - log.info("Using index at "+links.path);
60 - links.writer = WikiIndexModifier.openForWrite(links.path,false);
61 - initWriter(links.writer);
62 - links.reader = IndexReader.open(links.path);
63 - links.nsmap = Localization.getLocalizedNamespaces(links.langCode);
64 - links.interwiki = Localization.getInterwiki();
65 - links.state = State.FLUSHED;
66 - links.directory = links.writer.getDirectory();
67 - return links;
 92+ protected FieldSelector makeSelector(String field){
 93+ HashSet<String> onlySet = new HashSet<String>();
 94+ onlySet.add(field);
 95+ return new SetBasedFieldSelector(onlySet, new HashSet<String>());
6896 }
6997
70 - private static void initWriter(IndexWriter writer) {
71 - writer.setMergeFactor(20);
72 - writer.setMaxBufferedDocs(500);
73 - writer.setUseCompoundFile(true);
 98+ private void initWriter(IndexWriter writer) {
 99+ if(writer != null){
 100+ writer.setMergeFactor(20);
 101+ writer.setMaxBufferedDocs(500);
 102+ writer.setUseCompoundFile(true);
 103+ if(directory == null)
 104+ directory = writer.getDirectory();
 105+ }
74106 }
75 -
 107+
 108+ /** Open the index path for updates */
 109+ public static Links openForModification(IndexId iid) throws IOException{
 110+ iid = iid.getLinks();
 111+ String path = iid.getIndexPath();
 112+ log.info("Using index at "+path);
 113+ IndexWriter writer = WikiIndexModifier.openForWrite(path,false);
 114+ return new Links(iid,path,writer);
 115+ }
 116+
 117+ /** Open index at path for reading */
 118+ public static Links openForRead(IndexId iid, String path) throws IOException {
 119+ iid = iid.getLinks();
 120+ log.info("Opening for read "+path);
 121+ return new Links(iid,path,null);
 122+ }
 123+
 124+ /** Create new in the import path */
76125 public static Links createNew(IndexId iid) throws IOException{
77 - Links links = new Links(iid);
78 - links.path = iid.getTempPath();
79 - log.info("Making index at "+links.path);
80 - links.writer = WikiIndexModifier.openForWrite(links.path,true);
81 - links.reader = IndexReader.open(links.path);
82 - links.nsmap = Localization.getLocalizedNamespaces(links.langCode);
83 - links.interwiki = Localization.getInterwiki();
84 - links.state = State.FLUSHED;
85 - links.directory = links.writer.getDirectory();
 126+ iid = iid.getLinks();
 127+ String path = iid.getImportPath();
 128+ log.info("Making index at "+path);
 129+ IndexWriter writer = WikiIndexModifier.openForWrite(path,true);
 130+ Links links = new Links(iid,path,writer);
86131 return links;
87132 }
88133
 134+ /** Create new index in memory (RAMDirectory) */
89135 public static Links createNewInMemory(IndexId iid) throws IOException{
90 - Links links = new Links(iid);
91 - links.path = iid.getTempPath();
92 - log.info("Making index at "+links.path);
93 - links.writer = new IndexWriter(new RAMDirectory(),new SimpleAnalyzer(),true);
94 - links.reader = IndexReader.open(links.path);
95 - links.nsmap = Localization.getLocalizedNamespaces(links.langCode);
96 - links.interwiki = Localization.getInterwiki();
97 - links.state = State.FLUSHED;
98 - links.directory = links.writer.getDirectory();
 136+ iid = iid.getLinks();
 137+ log.info("Making index in memory");
 138+ IndexWriter writer = new IndexWriter(new RAMDirectory(),new SimpleAnalyzer(),true);
 139+ Links links = new Links(iid,null,writer);
99140 return links;
100141 }
101142
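Note: the factories above replace the old openExisting/createNew pair with an explicit lifecycle: openForModification and createNew come up with a writer attached, openForRead never acquires one, and ensureWrite/ensureRead (later in this diff) switch states on demand. A hedged sketch of a build pass with made-up content:

import java.io.IOException;
import org.wikimedia.lsearch.beans.Title;
import org.wikimedia.lsearch.config.IndexId;
import org.wikimedia.lsearch.ranks.Links;

public class LinksBuildSketch {
    static void build(IndexId iid) throws IOException {
        Links links = Links.createNew(iid);  // fresh links index in the import path
        links.addArticleInfo("See [[Example link]].", new Title(0, "Example page"));
        links.flush();                       // optimize and close
    }
}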
@@ -105,23 +146,21 @@
106147 }
107148 }
108149
 150+ /** Add a custom namespace mapping */
109151 public void addToNamespaceMap(String namespace, int index){
110152 nsmap.put(namespace.toLowerCase(),index);
111153 }
112154
113 - /** Write all changes, call after batch-adding of titles and articles
 155+ /** Write all changes, optimize/close everything
114156 * @throws IOException */
115157 public void flush() throws IOException{
116158 // close & optimize
117 - reader.close();
 159+ if(reader != null)
 160+ reader.close();
118161 if(writer != null){
119162 writer.optimize();
120163 writer.close();
121164 }
122 - // reopen
123 - writer = new IndexWriter(directory, new SimpleAnalyzer(), false);
124 - initWriter(writer);
125 - reader = IndexReader.open(path);
126165 state = State.FLUSHED;
127166 }
128167
@@ -130,41 +169,71 @@
131170 * Can still read.
132171 * @throws IOException
133172 */
134 - public void flushForRead() throws IOException{
 173+ protected void flushForRead() throws IOException{
135174 // close & optimize
136 - reader.close();
137 - writer.optimize();
138 - writer.close();
 175+ if(reader != null)
 176+ reader.close();
 177+ if(writer != null){
 178+ writer.optimize();
 179+ writer.close();
 180+ }
 181+ log.debug("Opening index reader");
139182 // reopen
140183 reader = IndexReader.open(path);
141184 writer = null;
142185 state = State.READ;
143186 }
144187
145 - /** Add a title to enable proper link analysis when adding articles
146 - * @throws IOException */
147 - public void addTitle(Title t) throws IOException{
148 - Document doc = new Document();
149 - doc.add(new Field("namespace",Integer.toString(t.getNamespace()),Field.Store.YES,Field.Index.UN_TOKENIZED));
150 - doc.add(new Field("title",t.getTitle(),Field.Store.YES,Field.Index.UN_TOKENIZED));
151 - doc.add(new Field("title_key",t.getKey(),Field.Store.YES,Field.Index.UN_TOKENIZED));
152 - writer.addDocument(doc);
153 - state = State.MODIFIED_TITLES;
 188+ /** Open the writer, and close the reader (if any) */
 189+ protected void openForWrite() throws IOException{
 190+ if(reader != null)
 191+ reader.close();
 192+ if(writer == null){
 193+ if(directory == null)
 194+ throw new RuntimeException("Opened for read, but trying to write");
 195+ writer = new IndexWriter(directory,new SimpleAnalyzer(),false);
 196+ initWriter(writer);
 197+ reader = null;
 198+ state = State.WRITE;
 199+ }
154200 }
155201
 202+ protected void ensureRead() throws IOException {
 203+ if(state != State.READ)
 204+ flushForRead();
 205+ }
 206+
 207+ protected void ensureWrite() throws IOException {
 208+ if(writer == null)
 209+ openForWrite();
 210+ }
 211+
 212+ /** Modify existing article links info */
 213+ public void modifyArticleInfo(String text, Title t) throws IOException{
 214+ ensureWrite();
 215+ writer.deleteDocuments(new Term("article_key",t.getKey()));
 216+ addArticleInfo(text,t);
 217+ }
 218+
156219 /** Add links and other info from article
157220 * @throws IOException */
158221 public void addArticleInfo(String text, Title t) throws IOException{
159 - if(state == State.MODIFIED_TITLES)
160 - flush();
 222+ ensureWrite();
161223 Pattern linkPat = Pattern.compile("\\[\\[(.*?)(\\|(.*?))?\\]\\]");
162224 int namespace = t.getNamespace();
163225 Matcher matcher = linkPat.matcher(text);
164226 int ns; String title;
165227 boolean escaped;
 228+
166229 HashSet<String> pagelinks = new HashSet<String>();
167 - HashSet<String> linkkeys = new HashSet<String>();
 230+ // article link -> contexts
 231+ HashMap<String,ArrayList<String>> contextMap = new HashMap<String,ArrayList<String>>();
168232
 233+		// use context only for namespaces in the default search set
 234+ boolean useContext = nsf.contains(t.getNamespace());
 235+
 236+ ContextParser cp = new ContextParser(text,imageLocalized,categoryLocalized,interwiki);
 237+
169238 Title redirect = Localization.getRedirectTitle(text,langCode);
170239 String redirectsTo = null;
171240 if(redirect != null){
@@ -172,9 +241,8 @@
173242 } else {
174243 while(matcher.find()){
175244 String link = matcher.group(1);
176 - String anchor = matcher.group(2);
177 - if(anchor != null && anchor.length()>1 && anchor.substring(1).equalsIgnoreCase(title(link)))
178 - anchor = null; // anchor same as link text
 245+ ContextParser.Context context = useContext? cp.getNext(matcher.start(1)) : null;
 246+
179247 int fragment = link.lastIndexOf('#');
180248 if(fragment != -1)
181249 link = link.substring(0,fragment);
@@ -204,156 +272,107 @@
205273 }
206274 if(ns == 0 && namespace!=0)
207275 continue; // skip links from other namespaces into the main namespace
208 - String target = findTargetLink(ns,title);
 276+ String target = findTargetLink(ns,title);
209277 if(target != null){
210 - //System.out.println("Found "+link);
211 - linkkeys.add(target); // for outlink storage
212 - pagelinks.add(target+"|"); // for backlinks
213 - if(anchor != null && !"|".equals(anchor))
214 - pagelinks.add(target+anchor); // for efficient anchortext extraction
 278+ int targetNs = Integer.parseInt(target.substring(0,target.indexOf(':')));
 279+ pagelinks.add(target); // for outlink storage
 280+ // register context of this link
 281+ if(context != null && nsf.contains(targetNs)){
 282+ ArrayList<String> ct = contextMap.get(target);
 283+ if(ct==null){
 284+ ct = new ArrayList<String>();
 285+ contextMap.put(target,ct);
 286+ }
 287+ ct.add(context.get(text));
 288+ }
215289 }
216290 }
217291 }
218292 // index article
219 - StringList sl = new StringList(pagelinks);
220 - StringList lk = new StringList(linkkeys);
 293+ StringList lk = new StringList(pagelinks);
221294 Analyzer an = new SplitAnalyzer();
222295 Document doc = new Document();
223 - doc.add(new Field("namespace",t.getNamespaceAsString(),Field.Store.YES,Field.Index.UN_TOKENIZED));
224 - doc.add(new Field("title",t.getTitle(),Field.Store.YES,Field.Index.UN_TOKENIZED));
225296 doc.add(new Field("article_key",t.getKey(),Field.Store.YES,Field.Index.UN_TOKENIZED));
226297 if(redirectsTo != null)
227 - doc.add(new Field("redirect",redirectsTo,Field.Store.YES,Field.Index.UN_TOKENIZED));
 298+ doc.add(new Field("redirect",redirectsTo+"|"+t.getKey(),Field.Store.YES,Field.Index.UN_TOKENIZED));
228299 else{
229 - doc.add(new Field("links",sl.toString(),Field.Store.NO,Field.Index.TOKENIZED));
230 - doc.add(new Field("links_stored",lk.toString(),Field.Store.YES,Field.Index.TOKENIZED));
 300+ doc.add(new Field("links",lk.toString(),Field.Store.COMPRESS,Field.Index.TOKENIZED));
231301 }
 302+ if(contextMap.size() != 0){
 303+ // serialize contextMap (article link -> contexts) into the compressed binary "context" field
 314+ doc.add(new Field("context",new StringMap(contextMap).serialize(),Field.Store.COMPRESS));
 315+ }
232316
233317 writer.addDocument(doc,an);
234 - state = State.MODIFIED_ARTICLES;
 318+ state = State.MODIFIED;
235319 }
236 - public static HashSet<Character> separators = new HashSet<Character>();
237 - static{
238 - separators.add(' ');
239 - separators.add('\r');
240 - separators.add('\n');
241 - separators.add('\t');
242 - separators.add(':');
243 - separators.add('(');
244 - separators.add(')');
245 - separators.add('[');
246 - separators.add(']');
247 - separators.add('.');
248 - separators.add(',');
249 - separators.add(':');
250 - separators.add(';');
251 - separators.add('"');
252 - separators.add('+');
253 - separators.add('*');
254 - separators.add('!');
255 - separators.add('~');
256 - separators.add('$');
257 - separators.add('%');
258 - separators.add('^');
259 - separators.add('&');
260 - separators.add('_');
261 - separators.add('=');
262 - separators.add('|');
263 - separators.add('\\');
264 - }
265320
266 - /**
267 - * Find a sentance boundaries
268 - *
269 - * @param text - raw text
270 - * @param start - start index to search from
271 - * @param reverse - if true, will lookup in reverse
272 - * @param max - radius of search (if no boundary is found return last wordbreak)
273 - * @return
274 - */
275 - protected int findSentance(char[] text, int start, boolean reverse, int max){
276 - int inc = (reverse)? -1 : 1;
277 - int count = 0;
278 - int wordbreak = start;
279 - int i = start;
280 - for(;i>0 && i<text.length;i+=inc){
281 - char c = text[i];
282 - if(c == '.')
283 - return i;
284 - else if(c == '*' && ((i>1 && text[i-1]=='\n') || i==0))
285 - return i;
286 - else if(separators.contains(c))
287 - wordbreak = i;
288 - if(count >= max)
289 - return wordbreak; // more than max chars away, return the latest wordbreak
290 - count ++;
291 - }
292 - return i;
293 - }
294 -
295 - /** Find surrounding for a link - extract sentances, list items .... */
296 - protected String findContext(char[] text, int start, int end){
297 - // TODO: implement
298 - return null;
299 - }
300 -
301321 /** Find the target key (ns:title) to which the link points
302322 * @throws IOException */
303323 protected String findTargetLink(int ns, String title) throws IOException{
304324 String key;
305325 if(title.length() == 0)
306326 return null;
307 - // try exact match
308 - key = ns+":"+title;
309 - if(reader.docFreq(new Term("title_key",key)) != 0)
310 - return key;
311 - // try lowercase
312 - key = ns+":"+title.toLowerCase();
313 - if(reader.docFreq(new Term("title_key",key)) != 0)
314 - return key;
315 - // try lowercase with first letter upper case
 327+
 328+ // first letter uppercase
316329 if(title.length()==1)
317330 key = ns+":"+title.toUpperCase();
318331 else
319 - key = ns+":"+title.substring(0,1).toUpperCase()+title.substring(1).toLowerCase();
320 - if(reader.docFreq(new Term("title_key",key)) != 0)
321 - return key;
322 - // try title case
323 - key = ns+":"+WordUtils.capitalize(title);
324 - if(reader.docFreq(new Term("title_key",key)) != 0)
325 - return key;
326 - // try upper case
327 - key = ns+":"+title.toUpperCase();
328 - if(reader.docFreq(new Term("title_key",key)) != 0)
329 - return key;
330 - // try capitalizing at word breaks
331 - key = ns+":"+WordUtils.capitalize(title,new char[] {' ','-','(',')','}','{','.',',','?','!'});
332 - if(reader.docFreq(new Term("title_key",key)) != 0)
333 - return key;
334 -
335 - return null;
 332+ key = ns+":"+title.substring(0,1).toUpperCase()+title.substring(1);
 333+ return key; // index everything, even if the target article doesn't exist
336334 }
337335
338336 /** Get number of backlinks to this title */
339337 public int getNumInLinks(String key) throws IOException{
340 - return reader.docFreq(new Term("links",key+"|"));
 338+ ensureRead();
 344+ return reader.docFreq(new Term("links",key));
341348 }
342349
343351 /** Get all article titles that redirect to given title */
 350+ @Deprecated
344 - public ArrayList<String> getRedirectsTo(String key) throws IOException{
 352+ public ArrayList<String> getRedirectsToOld(String key) throws IOException{
 353+ ensureRead();
345354 ArrayList<String> ret = new ArrayList<String>();
346355 TermDocs td = reader.termDocs(new Term("redirect",key));
347356 while(td.next()){
348 - ret.add(reader.document(td.doc()).get("article_key"));
 357+ ret.add(reader.document(td.doc(),keyOnly).get("article_key"));
349358 }
350359 return ret;
351360 }
352361
353 - protected void ensureRead() throws IOException {
354 - if(state != State.READ)
355 - flushForRead();
 362+ /** Get all article titles that redirect to given title */
 363+ public ArrayList<String> getRedirectsTo(String key) throws IOException{
 364+ ensureRead();
 365+ ArrayList<String> ret = new ArrayList<String>();
 366+ String prefix = key+"|";
 367+ // reader.terms() is already positioned at the first matching term, so check it before next()
 368+ TermEnum te = reader.terms(new Term("redirect",prefix));
 369+ do{
 370+ Term t = te.term();
 371+ if(t == null || !t.field().equals("redirect") || !t.text().startsWith(prefix))
 372+ break;
 373+ ret.add(t.text().substring(prefix.length()));
 374+ } while(te.next());
 375+ return ret;
356376 }
357 -
358377
359378 /** If an article is a redirect
360379 * @throws IOException */
@@ -361,75 +380,43 @@
362381 ensureRead();
363382 TermDocs td = reader.termDocs(new Term("article_key",key));
364383 if(td.next()){
365 - if(reader.document(td.doc()).get("redirect")!=null)
 384+ if(reader.document(td.doc(),redirectOnly).get("redirect")!=null)
366385 return true;
367386 }
368387 return false;
369388 }
370 -
 389+
371391 /** If article is redirect, get target, else null */
 390+ @Deprecated
372 - public String getRedirectTarget(String key) throws IOException{
 392+ public String getRedirectTargetOld(String key) throws IOException{
373393 ensureRead();
374394 TermDocs td = reader.termDocs(new Term("article_key",key));
375395 if(td.next()){
376 - return reader.document(td.doc()).get("redirect");
 396+ return reader.document(td.doc(),redirectOnly).get("redirect");
377397 }
378398 return null;
379399 }
380400
381 - /** Get only anchors without frequency */
382 - public ArrayList<String> getAnchors(String key) throws IOException{
 401+ /** If article is redirect, get target, else null */
 402+ public String getRedirectTarget(String key) throws IOException{
383403 ensureRead();
384 - ArrayList<String> ret = new ArrayList<String>();
385 - TermEnum te = reader.terms(new Term("links",key+"|"));
386 - while(te.next()){
387 - String t = te.term().text();
388 - if(!t.startsWith(key) || !te.term().field().equals("links"))
389 - break;
390 - ret.add(t.substring(key.length()+1));
 404+ TermDocs td = reader.termDocs(new Term("article_key",key));
 405+ if(td.next()){
 406+ String t = reader.document(td.doc(),redirectOnly).get("redirect");
 407+ if(t != null) // field is stored as "target|source"
 408+ return t.substring(0,t.indexOf('|'));
391408 }
392 - return ret;
 409+ return null;
393410 }
394 -
395 - /** Get title part of the key (ns:title) */
396 - private String title(String key) {
397 - return key.substring(key.indexOf(':')+1);
398 - }
399411
400 - /** Get anchor texts for given title
401 - * @throws IOException */
402 - public ArrayList<AnchorText> getAnchorText(String key) throws IOException{
403 - ensureRead();
404 - ArrayList<AnchorText> ret = new ArrayList<AnchorText>();
405 - TermEnum te = reader.terms(new Term("links",key+"|"));
406 - while(te.next()){
407 - if(!te.term().text().startsWith(key) || !te.term().field().equals("links"))
408 - break;
409 - ret.add(new AnchorText(te.term().text().substring(key.length()),te.docFreq()));
410 - }
411 - return ret;
412 - }
413412
414 - static public class AnchorText {
415 - public String text; /** ns:title **/
416 - public int freq;
417 - public AnchorText(String text, int freq) {
418 - this.text = text;
419 - this.freq = freq;
420 - }
421 - }
422 -
423 - /** Get all article titles linking to given title
424 - * @throws IOException */
425 - public ArrayList<String> getInLinks(String key, HashMap<Integer,String> keyCache) throws IOException{
 413+ /** Return the namespace of the redirect target (if any) */
 414+ public int getRedirectTargetNamespace(String key) throws IOException{
426415 ensureRead();
427 - ArrayList<String> ret = new ArrayList<String>();
428 - TermDocs td = reader.termDocs(new Term("links",key+"|"));
429 - while(td.next()){
430 - ret.add(keyCache.get(td.doc()));
431 - //ret.add(reader.document(td.doc()).get("article_key"));
 416+ String t = getRedirectTarget(key);
 417+ if(t != null){
 418+ return Integer.parseInt(t.substring(0,t.indexOf(':')));
432419 }
433 - return ret;
 420+ return 0;
434421 }
435422
436423 /** Get all article titles linking to given title
@@ -437,9 +424,11 @@
438425 public ArrayList<CompactArticleLinks> getInLinks(CompactArticleLinks key, HashMap<Integer,CompactArticleLinks> keyCache) throws IOException{
439426 ensureRead();
440427 ArrayList<CompactArticleLinks> ret = new ArrayList<CompactArticleLinks>();
441 - TermDocs td = reader.termDocs(new Term("links",key+"|"));
 428+ TermDocs td = reader.termDocs(new Term("links",key.toString()));
442429 while(td.next()){
443 - ret.add(keyCache.get(td.doc()));
 430+ CompactArticleLinks cs = keyCache.get(td.doc());
 431+ if(cs != null)
 432+ ret.add(cs);
444433 }
445434 return ret;
446435 }
@@ -449,9 +438,9 @@
450439 public ArrayList<String> getInLinks(String key) throws IOException{
451440 ensureRead();
452441 ArrayList<String> ret = new ArrayList<String>();
453 - TermDocs td = reader.termDocs(new Term("links",key+"|"));
 442+ TermDocs td = reader.termDocs(new Term("links",key));
454443 while(td.next()){
455 - ret.add(reader.document(td.doc()).get("article_key"));
 444+ ret.add(reader.document(td.doc(),keyOnly).get("article_key"));
456445 }
457446 return ret;
458447 }
@@ -461,60 +450,77 @@
462451 ensureRead();
463452 TermDocs td = reader.termDocs(new Term("article_key",key));
464453 if(td.next()){
465 - return new StringList(reader.document(td.doc()).get("links_stored"));
 454+ return new StringList(reader.document(td.doc(),linksOnly).get("links"));
466455 }
467456 return null;
468457 }
469458
470 - public Dictionary getKeys() throws IOException{
 459+ /** Get all contexts in which article <i>to</i> is linked from <i>from</i>.
 460+ * Will return null if there is no context, or the link is invalid. */
 463+ public ArrayList<String> getContext(String from, String to) throws IOException {
471464 ensureRead();
472 - return new LuceneDictionary(reader,"article_key");
473 - }
474 - @Deprecated
475 - protected void cacheInLinks() throws IOException{
476 - if(state != State.FLUSHED)
477 - flush();
478 - log.info("Caching in-links");
479 - int count = 0;
480 - // docid -> key
481 - HashMap<Integer,String> keyCache = new HashMap<Integer,String>();
482 - Dictionary dict = new LuceneDictionary(reader,"article_key");
483 - Word w;
484 - // build key cache
485 - while((w = dict.next()) != null){
486 - String key = w.getWord();
487 - TermDocs td = reader.termDocs(new Term("article_key",key));
488 - if(td.next()){
489 - keyCache.put(td.doc(),key);
490 - } else
491 - log.error("Cannot find article for key "+key);
 465+ String cacheKey = "getContext:"+from;
 467+ Object fromCache = cache.get(cacheKey);
 468+ if(fromCache != null){
 471+ StringMap map = (StringMap) fromCache;
 472+ return map.get(to);
492473 }
493 -
494 - // get inlinks
495 - for(String key : keyCache.values()){
496 - ArrayList<String> in = getInLinks(key,keyCache);
497 - Document doc = new Document();
498 - doc.add(new Field("inlinks_key",key,Field.Store.YES,Field.Index.UN_TOKENIZED));
499 - doc.add(new Field("inlinks",new StringList(in).toString(),Field.Store.YES,Field.Index.UN_TOKENIZED));
500 - writer.addDocument(doc);
501 - count ++;
502 - if(count % 1000 == 0){
503 - System.out.println("Cached inlinks for "+count);
 474+ TermDocs td = reader.termDocs(new Term("article_key",from));
 475+ if(td.next()){
 476+ byte[] serialized = reader.document(td.doc(),contextOnly).getBinaryValue("context");
 477+ if(serialized == null)
 478+ return null;
 479+ StringMap map = new StringMap(serialized);
 480+ try {
 484+ // cache it!
 490+ cache.put(cacheKey,map);
 491+ return map.get(to);
 495+ } catch(Exception e){
 496+ e.printStackTrace();
504497 }
 498+
505499 }
 500+
 501+ return null;
506502 }
507503
508 - /** Get all article titles linking to given title (from inlinks cache)
509 - * @throws IOException */
510 - public Collection<String> getInLinksFromCache(String key) throws IOException{
511 - ensureRead();
512 - TermDocs td = reader.termDocs(new Term("inlinks_key",key));
513 - while(td.next()){
514 - return new StringList(reader.document(td.doc()).get("inlinks")).toCollection();
 504+ /** Get all contexts in which article <i>to</i> is linked from <i>from</i>
 505+ * (old format, one document per link).
 506+ * Will return null if there is no context, or the link is invalid. */
 508+ public Collection<String> getContextOld(String from, String to) throws IOException {
 509+ ensureRead();
 510+
 511+ TermDocs td = reader.termDocs(new Term("context_key",to+"|"+from));
 512+ if(td.next()){
 513+ return new StringList(reader.document(td.doc()).get("context")).toCollection();
515514 }
516 - return new ArrayList<String>();
 515+
 516+ return null;
517517 }
518 -
 518+
 519+ /** Get a dictionary of all article keys (ns:title) in this index */
 520+ public Dictionary getKeys() throws IOException{
 521+ ensureRead();
 522+ return new LuceneDictionary(reader,"article_key");
 523+ }
 524+
519525 public Integer getDocId(String key) throws IOException {
520526 TermDocs td = reader.termDocs(new Term("article_key",key));
521527 if(td.next()){
@@ -530,7 +536,18 @@
531537 if(reader != null)
532538 reader.close();
533539 if(directory != null)
534 - directory.close();
535 -
 540+ directory.close();
536541 }
 542+
 543+ public ObjectCache getCache() {
 544+ return cache;
 545+ }
 546+
537554 }
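A note on the encoding above: the "redirect" field packs both endpoints of a redirect into a single indexed term, "target|source", so all redirects to a target share the prefix "target|" and can be enumerated with one term scan (getRedirectsTo), while the target itself is recovered by splitting on '|' (getRedirectTarget). A minimal sketch of the convention — RedirectTerm is a hypothetical helper, not part of this commit, and assumes keys of the form ns:title never contain '|':

    public class RedirectTerm {
        // one indexed term per redirect: "target|source"
        public static String encode(String targetKey, String sourceKey){
            return targetKey + "|" + sourceKey;
        }
        public static String target(String term){
            return term.substring(0, term.indexOf('|'));
        }
        public static String source(String term){
            return term.substring(term.indexOf('|') + 1);
        }
        public static void main(String[] args){
            String term = encode("0:Main Page", "0:MainPage");
            System.out.println(target(term)); // 0:Main Page
            System.out.println(source(term)); // 0:MainPage
        }
    }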
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/ContextParser.java
@@ -0,0 +1,272 @@
 2+package org.wikimedia.lsearch.ranks;
 3+
 4+import java.util.ArrayList;
 5+import java.util.HashSet;
 7+
 9+
 10+/**
 11+ * Parse wiki-text into sentences. Each sentence will provide a
 12+ * context for links within it.
 13+ *
 14+ * @author rainman
 15+ *
 16+ */
 17+public class ContextParser {
 18+ protected char[] text;
 19+ protected int len;
 20+ protected HashSet<String> imageLocalized = null;
 21+ protected HashSet<String> categoryLocalized = null;
 22+ protected HashSet<String> interwiki = null;
 23+
 24+ protected ArrayList<Context> contexts = null;
 25+ protected int conIn = 0;
 26+
 27+ public static class Context {
 28+ int start;
 29+ int end;
 30+ String context = null;
 31+ public Context(int start, int end) {
 32+ this.start = start;
 33+ this.end = end;
 34+ }
 35+
 36+ public String get(String text){
 37+ if(context == null)
 38+ context = text.substring(start,end);
 39+ return context;
 40+ }
 41+
 42+ }
 43+
 44+ public ContextParser(String text, HashSet<String> imageLocalized, HashSet<String> categoryLocalized, HashSet<String> interwiki){
 45+ this.text = text.toCharArray();
 46+ this.len = this.text.length;
 47+ this.imageLocalized = imageLocalized;
 48+ this.categoryLocalized = categoryLocalized;
 49+ this.interwiki = interwiki;
 50+ parse();
 51+ }
 52+
 53+ /** Get the boundaries of all contexts (usually individual sentences) */
 54+ public ArrayList<Context> getContexts(){
 55+ return contexts;
 56+ }
 57+
 58+ /** Get context by index in text; must be called with monotonically increasing indexes */
 59+ public Context getNext(int index){
 60+ if(conIn >= contexts.size())
 61+ return null;
 62+ Context c = contexts.get(conIn);
 63+ if(c.start > index)
 64+ return null;
 65+ else{
 66+ for(;conIn<contexts.size();conIn++){
 67+ c = contexts.get(conIn);
 68+ if(c.start <= index && index < c.end)
 69+ return c;
 70+ if(c.start > index)
 71+ return null; // no context for this index
 72+ }
 73+ }
 74+ return null;
 75+ }
 76+
 77+ /** Fetch the link prefix (text up to the first ':'), at most 128 chars */
 78+ protected String fetchPrefix(int in){
 79+ int count = 0;
 80+ for(int i=in;i<len;i++,count++){
 81+ if(count >= 128)
 82+ return null;
 83+ if(text[i] == ':'){
 84+ return new String(text,in,count);
 85+ }
 86+ }
 87+ return null;
 88+ }
 89+
 90+ protected void parse(){
 91+ if(contexts != null)
 92+ return;
 93+ contexts = new ArrayList<Context>();
 94+ int cur = 0;
 95+ char c;
 96+ boolean seenLetter = false;
 97+ int topLinkLevel = 0;
 98+ boolean inQuotes = false;
 99+ int start = 0;
 100+ for(;cur<len;cur++){
 101+ c = text[cur];
 102+ if(!seenLetter && Character.isLetterOrDigit(c))
 103+ seenLetter = true;
 104+ switch(c){
 105+ case '[':
 106+ if(cur+2>=len)
 107+ continue; // EOF
 108+ if(text[cur+1]=='['){
 109+ boolean valid = false;
 110+ int closingInx = -1;
 111+ // seek to see if this is valid link opening
 112+ for(int i=cur+2;i<len && i<cur+512;i++){
 113+ if(text[i]=='[' && i+1<len && text[i+1]=='[')
 114+ break; // bad internal link
 115+ if(text[i]==']' && i+1<len && text[i+1]==']'){
 116+ topLinkLevel++; // ok, valid internal link
 117+ closingInx = i+2;
 118+ valid = true;
 119+ break;
 120+ }
 121+
 122+ }
 123+ // beginning of a link
 124+ String prefix = fetchPrefix(cur+2);
 125+ if(prefix != null && isImage(prefix)){
 126+ // take full image caption as one context
 127+ int lastPipe = cur + 2 + prefix.length();
 128+ int linkLevel = 0;
 129+ int imageEnd = -1;
 130+ for(int i=lastPipe;i<len;i++){
 131+ if(text[i]=='|')
 132+ lastPipe = i;
 133+ // internal link begin
 134+ if(text[i]=='[' && i+1<len && text[i+1]=='[')
 135+ linkLevel++;
 136+ // internal link end
 137+ if(text[i]==']' && i+1<len && text[i+1]==']'){
 138+ if(linkLevel == 0){
 139+ imageEnd = i+1;
 140+ break;
 141+ } else
 142+ linkLevel--;
 143+ }
 144+ }
 145+ // add everything up to the image as one context,
 146+ // and the image caption as a second context
 147+ if(imageEnd != -1){
 148+ if(seenLetter)
 149+ contexts.add(new Context(start,cur));
 150+ contexts.add(new Context(lastPipe+1,imageEnd-1));
 151+ start = imageEnd+1;
 152+ cur = imageEnd;
 153+ seenLetter = false;
 154+ }
 152+ } else if(valid && prefix != null && (isCategory(prefix) || isInterwiki(prefix))){
 153+ // skip categories and interwiki links
 154+ if(seenLetter)
 155+ contexts.add(new Context(start,cur));
 156+ start = cur;
 157+ cur = closingInx;
 158+ }
 159+ }
 160+ break;
 161+ case 'h': case 'f':
 162+ // check simple http/ftp links
 163+ if(checkPrefix(cur,"http://") || checkPrefix(cur,"ftp://")){
 164+ if(seenLetter && cur-start>2)
 165+ contexts.add(new Context(start,cur-1));
 166+ for(;cur<len;cur++){
 167+ if(text[cur]==' ' || text[cur]==']'){ // seek to after link
 168+ start = cur+1;
 169+ seenLetter = false;
 170+ break;
 171+ }
 172+ }
 173+ }
 174+ break;
 175+ case '<':
 176+ if(checkPrefix(cur,"<tr>") || checkPrefix(cur,"</tr>")){
 177+ if(seenLetter)
 178+ contexts.add(new Context(start,cur-1));
 179+ start = cur + 4;
 180+ }
 181+ break;
 182+ case ']':
 183+ if(cur+2>=len)
 184+ continue; // EOF
 185+ if(text[cur+1]==']' && topLinkLevel!=0){
 186+ topLinkLevel--;
 187+ }
 188+ break;
 189+ case '"':
 190+ // numbers like 6'5"
 191+ if(cur>0 && Character.isDigit(text[cur-1]))
 192+ break;
 193+ inQuotes = !inQuotes;
 194+ break;
 195+ case '=':
 196+ case '!':
 197+ case '?':
 198+ case '{':
 199+ case '}':
 200+ case '*':
 201+ case '#':
 202+ case '|':
 203+ case '.':
 204+ case '\n':
 208+ // only a double '=' (as in headings) is a separator
 209+ if(c == '=' && !(cur+1<len && text[cur+1]=='='))
 210+ break;
 211+ // '|' is a separator in tables etc., but not in link syntax like [[x|y]]
 212+ if(c == '|' && topLinkLevel != 0 && (cur+1<len && text[cur+1]!='-'))
 213+ break;
 214+ // don't split on a dot/comma between digits (e.g. 3.14)
 215+ if((c == '.' || c==',') && (cur>0 && Character.isDigit(text[cur-1]) && cur+1<len && Character.isDigit(text[cur+1])))
 216+ break;
 217+ // treat '\n' as a separator only at a paragraph break (\n\n) or an indented line (\n:)
 218+ if(c == '\n' && !(cur+1<len && (text[cur+1]=='\n' || text[cur+1]==':')))
 219+ break;
 220+
 221+ if(seenLetter){
 222+ contexts.add(new Context(start,cur));
 223+ start = cur + 1;
 224+ seenLetter = false;
 225+ }
 226+ break;
 227+ }
 228+ }
 229+ if(seenLetter)
 230+ contexts.add(new Context(start,len));
 231+ }
 232+
 233+ /** Check if the text at position cur begins with the given prefix */
 234+ private boolean checkPrefix(int cur, String prefix) {
 235+ if(cur + prefix.length() <= len){
 236+ for(int i=0;i<prefix.length();i++){
 237+ if(text[cur+i] != prefix.charAt(i))
 238+ return false;
 239+ }
 240+ return true;
 241+ }
 242+ return false;
 243+ }
 244+
 245+ /** Check if this is an "image" keyword using localization */
 246+ private final boolean isImage(String prefix){
 247+ prefix = prefix.toLowerCase();
 248+ if(prefix.equals("image"))
 249+ return true;
 250+ if(imageLocalized!=null && imageLocalized.contains(prefix))
 251+ return true;
 252+ return false;
 253+ }
 254+
 255+ private final boolean isCategory(String prefix){
 256+ prefix = prefix.toLowerCase();
 257+ if(prefix.equals("category"))
 258+ return true;
 259+ if(categoryLocalized!=null && categoryLocalized.contains(prefix))
 260+ return true;
 261+ return false;
 262+ }
 263+
 264+ private final boolean isInterwiki(String prefix){
 265+ if(interwiki!=null)
 266+ return interwiki.contains(prefix);
 267+ else
 268+ return false;
 269+ }
 270+
 271+
 272+
 273+}
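A minimal driver showing how ContextParser is meant to be used: parse once, then ask getNext() for each link offset in increasing order. ContextParserDemo is hypothetical and not part of this commit; passing null for the localization sets falls back to the English-only "image"/"category" checks above:

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class ContextParserDemo {
        public static void main(String[] args){
            String text = "Rome is the capital of [[Italy]]. It lies on the [[Tiber]].";
            ContextParser cp = new ContextParser(text, null, null, null);
            Matcher m = Pattern.compile("\\[\\[(.*?)\\]\\]").matcher(text);
            while(m.find()){
                // offsets must increase monotonically between calls
                ContextParser.Context c = cp.getNext(m.start(1));
                if(c != null)
                    System.out.println(m.group(1) + " -> " + c.get(text));
            }
        }
    }

Each link is printed with the sentence containing it, which is what Links.addArticleInfo stores per target key.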
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/RankBuilder.java
@@ -56,7 +56,6 @@
5757 public static void main(String[] args) throws IOException {
5858 String inputfile = null;
5959 String dbname = null;
60 - boolean useExistingTemp = false;
6160
6261 System.out.println("MediaWiki Lucene search indexer - build rank info from xml dumps.\n");
6362
@@ -64,15 +63,11 @@
6564 log = Logger.getLogger(RankBuilder.class);
6665
6766 if(args.length < 2){
68 - System.out.println("Syntax: java RankBuilder [-t] <inputfile> <dbname>");
69 - System.out.println("Options:");
70 - System.out.println(" -t - use existing temporary ranking index");
 67+ System.out.println("Syntax: java RankBuilder <inputfile> <dbname>");
7168 return;
7269 }
7370 for(int i=0;i<args.length;i++){
74 - if(args[i].equals("-t"))
75 - useExistingTemp = true;
76 - else if(inputfile == null)
 71+ if(inputfile == null)
7772 inputfile = args[i];
7873 else if(dbname == null)
7974 dbname = args[i];
@@ -92,59 +87,22 @@
9388 long start = System.currentTimeMillis();
9489
9590 // link info
96 - Links links = null;
97 - if(useExistingTemp)
98 - links = Links.openExisting(iid);
99 - else
100 - links = processLinks(inputfile,getTitles(inputfile,langCode,iid),langCode);
101 - //links.cacheInLinks();
102 - /*log.info("Creating ref count cache");
103 - HashMap<Integer,Integer> refCache = new HashMap<Integer,Integer>();
104 - HashMap<Integer,String> keyCache = new HashMap<Integer,String>();
105 - HashMap<String,Integer> docIdCache = new HashMap<String,Integer>();
106 - Word w; Dictionary d = links.getKeys();
107 - while((w = d.next()) != null){
108 - String key = w.getWord();
109 - int docid = links.getDocId(key);
110 - refCache.put(docid,links.getNumInLinks(key));
111 - keyCache.put(docid,key);
112 - docIdCache.put(key,docid);
 91+ Links links = Links.createNew(iid);
 92+ try{
 93+ processLinks(inputfile,links,iid,langCode);
 94+ } catch(IOException e){
 95+ log.fatal("I/O error processing "+inputfile+" : "+e.getMessage());
 96+ e.printStackTrace();
11397 }
114 - log.info("Caching in/out links");
115 - HashMap<Integer,int[]> outLinkCache = new HashMap<Integer,int[]>();
116 - HashMap<Integer,int[]> inLinkCache = new HashMap<Integer,int[]>();
117 - // cache in/out links
118 - d = links.getKeys();
119 - while((w = d.next()) != null){
120 - String key = w.getWord();
121 - int docid = docIdCache.get(key);
122 - Collection<String> in = links.getInLinks(key,keyCache);
123 - int[] inset = new int[in.size()];
124 - int i=0;
125 - for(String k : in)
126 - inset[i++] = docIdCache.get(k);
127 - inLinkCache.put(docid,inset);
128 -
129 - Collection<String> out = links.getOutLinks(key).toCollection();
130 - int[] outset = new int[out.size()];
131 - i = 0;
132 - for(String k : out){
133 - outset[i++] = docIdCache.get(k);
134 - }
135 - outLinkCache.put(docid,outset);
136 - }
137 - storeLinkAnalysis(links,iid,docIdCache,keyCache,refCache,inLinkCache,outLinkCache); */
138 - storeLinkAnalysis(links,iid);
139 - //Storage store = Storage.getInstance();
140 - //store.storePageReferences(links.getAll(),dbname);
141 - //storeRelated(store,links,dbname);
142 -
 98+
 99+ IndexThread.makeIndexSnapshot(iid.getLinks(),iid.getLinks().getImportPath());
 100+
143101 long end = System.currentTimeMillis();
144102
145103 System.out.println("Finished generating ranks in "+formatTime(end-start));
146104 }
147105
148 - //public static void storeLinkAnalysis(Links links, IndexId iid, HashMap<String, Integer> docIdCache, HashMap<Integer, String> keyCache, HashMap<Integer, Integer> refCache, HashMap<Integer, int[]> inLinkCache, HashMap<Integer, int[]> outLinkCache) throws IOException{
 106+ @Deprecated
149107 public static void storeLinkAnalysis(Links links, IndexId iid) throws IOException{
150108 log.info("Storing link analysis data");
151109 LinkAnalysisStorage store = new LinkAnalysisStorage(iid);
@@ -154,7 +112,7 @@
155113 String key = w.getWord();
156114 int ref = links.getNumInLinks(key);
157115 String redirectTarget = links.getRedirectTarget(key);
158 - ArrayList<String> anchor = links.getAnchors(key);
 116+ ArrayList<String> anchor = null; //links.getAnchors(key);
159117 ArrayList<Related> related = new ArrayList<Related>(); //FIXME: too slow getRelated(key,links,refCount,keyCache);
160118 //ArrayList<Related> related = getRelated(key,links,docIdCache,keyCache,refCache,inLinkCache,outLinkCache);
161119 ArrayList<String> redirect = links.getRedirectsTo(key);
@@ -164,53 +122,16 @@
165123
166124 }
167125
168 - public static Links processLinks(String inputfile, Links links, String langCode) {
169 - log.info("Second pass, calculating article links...");
170 - InputStream input = null;
171 - // second pass - calculate page ranks
172 - try {
173 - input = Tools.openInputFile(inputfile);
174 - } catch (IOException e) {
175 - log.fatal("I/O error opening "+inputfile+" : "+e.getMessage());
176 - return null;
177 - }
 126+ public static Links processLinks(String inputfile, Links links, IndexId iid, String langCode) throws IOException {
 127+ log.info("Calculating article links...");
 128+ InputStream input = Tools.openInputFile(inputfile);
178129 // calculate ranks
179 - LinkReader rr = new LinkReader(links,langCode);
 130+ LinkReader rr = new LinkReader(links,iid,langCode);
180131 XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(rr, 5000));
181 - try {
182 - reader.readDump();
183 - links.flush();
184 - } catch (IOException e) {
185 - log.fatal("I/O error reading dump while calculating ranks for from "+inputfile+" : "+e.getMessage());
186 - return null;
187 - }
 132+ reader.readDump();
 133+ links.flush();
188134 return links;
189135 }
190 -
191 - public static Links getTitles(String inputfile,String langCode,IndexId iid) {
192 - log.info("First pass, getting a list of valid articles...");
193 - InputStream input = null;
194 - try {
195 - input = Tools.openInputFile(inputfile);
196 - } catch (IOException e) {
197 - log.fatal("I/O error opening "+inputfile+" : "+e.getMessage());
198 - return null;
199 - }
200 - try {
201 - // first pass, get titles
202 - TitleReader tr = new TitleReader(langCode,iid);
203 - XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(tr, 5000));
204 - reader.readDump();
205 - input.close();
206 - Links links = tr.getLinks();
207 - links.flush();
208 - return links;
209 - } catch (IOException e) {
210 - log.fatal("I/O error reading dump while getting titles from "+inputfile+" : "+e.getMessage());
211 - return null;
212 - }
213 -
214 - }
215136
216137 /**
217138 * Get related articles, sorted descending by score
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/StringMap.java
@@ -0,0 +1,198 @@
 2+package org.wikimedia.lsearch.ranks;
 3+
 4+import java.io.ByteArrayInputStream;
 5+import java.io.ByteArrayOutputStream;
 6+import java.io.DataInputStream;
 7+import java.io.DataOutputStream;
 8+import java.io.EOFException;
 9+import java.io.IOException;
 10+import java.io.UnsupportedEncodingException;
 11+import java.nio.ByteBuffer;
 12+import java.util.ArrayList;
 13+import java.util.Collections;
 14+import java.util.Comparator;
 15+import java.util.HashMap;
 16+import java.util.HashSet;
 17+import java.util.Map.Entry;
 18+
 19+public class StringMap {
 20+ protected int pos = 0;
 23+ protected HashMap<String,ArrayList<String>> map = null;
 24+ protected HashMap<Integer,ArrayList<Integer>> hashMap = null;
 25+ protected byte[] serialized = null;
 26+ public static final char DELIMITER = '\0';
 27+ protected final int INT_SIZE = Integer.SIZE / 8;
 28+
 29+ public StringMap(HashMap<String,ArrayList<String>> map){
 30+ this.map = map;
 31+ }
 32+
 33+ public StringMap(byte[] serialized) throws IOException{
 34+ this.serialized = serialized;
 35+ readHash();
 36+ }
 37+
 38+ /** Initialize the small hashmap at the beginning of the stream */
 39+ private void readHash() throws IOException {
 40+ hashMap = new HashMap<Integer,ArrayList<Integer>>();
 41+ ByteArrayInputStream ba = new ByteArrayInputStream(serialized);
 42+ DataInputStream di = new DataInputStream(ba);
 43+ int size = di.readInt();
 44+ for(int i=0;i<size;i++){
 45+ int hash = di.readInt();
 46+ ArrayList<Integer> pos = hashMap.get(hash);
 47+ if(pos == null){
 48+ pos = new ArrayList<Integer>();
 49+ hashMap.put(hash,pos);
 50+ }
 51+ pos.add(di.readInt());
 52+ }
 53+ }
 54+
 55+ protected int encLen(String str) throws UnsupportedEncodingException{
 56+ return str.getBytes("utf-8").length;
 57+ }
 58+
 59+ public byte[] serialize() throws IOException{
 60+ if(serialized != null)
 61+ return serialized;
 62+ // unique string, string -> index (within string segment)
 63+ HashMap<String,Integer> strings = new HashMap<String,Integer>();
 64+ // hash -> list of keys
 65+ HashMap<Integer,ArrayList<String>> hashs = new HashMap<Integer,ArrayList<String>>();
 66+ // contexts, key -> index of string (from strings)
 67+ HashMap<String,ArrayList<Integer>> contexts = new HashMap<String,ArrayList<Integer>>();
 68+ // keys in some order
 69+ ArrayList<String> keys = new ArrayList<String>();
 70+ keys.addAll(map.keySet());
 71+ int offset = 0;
 72+ for(String key : keys){
 73+ // mapping hash -> keys
 74+ int hash = key.hashCode();
 75+ ArrayList<String> hk = hashs.get(hash);
 76+ if(hk == null){
 77+ hk = new ArrayList<String>();
 78+ hashs.put(hash,hk);
 79+ }
 80+ hk.add(key);
 81+ // contexts
 82+ ArrayList<Integer> cc = new ArrayList<Integer>();
 83+ contexts.put(key,cc);
 84+ for(String s : map.get(key)){
 85+ // identifier
 86+ Integer i = strings.get(s);
 87+ if(i == null){
 88+ i = offset;
 89+ strings.put(s,i);
 90+ offset += encLen(s) + INT_SIZE;
 91+ }
 92+ cc.add(i);
 93+ }
 94+ }
 95+ int keyOffset = INT_SIZE+2*INT_SIZE*map.size();
 96+ int stringOffset = keyOffset;
 97+ // key -> offset
 98+ HashMap<String,Integer> keyOffsets = new HashMap<String,Integer>();
 99+ for(String key : keys){
 100+ keyOffsets.put(key,stringOffset);
 101+ stringOffset += INT_SIZE+encLen(key)+INT_SIZE+contexts.get(key).size()*INT_SIZE;
 102+ }
 103+ // serialize!
 104+ ByteArrayOutputStream ba = new ByteArrayOutputStream();
 105+ DataOutputStream ds = new DataOutputStream(ba);
 106+ ds.writeInt(map.size()); // one (hash, key offset) pair per key, so hash collisions are counted
 107+ // write out the hash -> key offset table
 108+ ArrayList<Entry<Integer,ArrayList<String>>> sortedHash = new ArrayList<Entry<Integer,ArrayList<String>>>();
 109+ sortedHash.addAll(hashs.entrySet());
 110+ Collections.sort(sortedHash,new Comparator<Entry<Integer,ArrayList<String>>>(){
 111+ public int compare(Entry<Integer, ArrayList<String>> o1, Entry<Integer, ArrayList<String>> o2) {
 112+ return o1.getKey() - o2.getKey();
 113+ }
 114+ });
 115+ // write pairs: [hash] [key offset]
 116+ for(Entry<Integer,ArrayList<String>> e : sortedHash){
 117+ int hash = e.getKey();
 118+ for(String key : e.getValue()){
 119+ ds.writeInt(hash);
 120+ ds.writeInt(keyOffsets.get(key));
 121+ }
 122+ }
 123+ // write: [ key.length ] [ key ] [context1_pos] [context2_pos] ...
 124+ for(String key : keys){
 125+ byte[] b = key.getBytes("utf-8");
 126+ ds.writeInt(b.length);
 127+ ds.write(b);
 128+ ArrayList<Integer> con = contexts.get(key);
 129+ if(con == null || con.size()==0)
 130+ ds.writeInt(0);
 131+ else{
 132+ ds.writeInt(con.size());
 133+ for(Integer index : con){
 134+ ds.writeInt(stringOffset+index);
 135+ }
 136+ }
 137+ }
 138+ // write string as [size] [string]
 139+ HashSet<String> written = new HashSet<String>();
 140+ for(String key : keys){
 141+ for(String c : map.get(key)){
 142+ if(written.contains(c))
 143+ continue;
 144+ byte[] b = c.getBytes("utf-8");
 145+ ds.writeInt(b.length);
 146+ ds.write(b);
 147+ written.add(c);
 148+ }
 149+ }
 150+ serialized = ba.toByteArray();
 151+ return serialized;
 152+ }
 153+
 154+ private final int read(){
 155+ return serialized[pos++] & 0xff;
 156+ }
 157+
 158+ protected int readInt() throws IOException {
 159+ int ch1 = read();
 160+ int ch2 = read();
 161+ int ch3 = read();
 162+ int ch4 = read();
 163+ if ((ch1 | ch2 | ch3 | ch4) < 0)
 164+ throw new EOFException();
 165+ return ((ch1 << 24) + (ch2 << 16) + (ch3 << 8) + (ch4 << 0));
 166+ }
 167+
 168+ protected String readString() throws IOException{
 169+ int len = readInt();
 170+ int start = pos;
 171+ pos+=len;
 172+ return new String(serialized,start,len,"utf-8");
 173+ }
 174+
 175+ /** Get the array of context strings for a key
 176+ * @throws IOException */
 177+ public ArrayList<String> get(String key) throws IOException{
 178+ ArrayList<String> ret = new ArrayList<String>();
 179+ if(!hashMap.containsKey(key.hashCode()))
 180+ return ret;
 181+ for(Integer p : hashMap.get(key.hashCode())){
 182+ pos = p;
 183+ String k = readString();
 184+ if(key.equals(k)){
 185+ // found key, read context
 186+ int num = readInt();
 187+ int[] strings = new int[num];
 188+ for(int i=0;i<num;i++){
 189+ strings[i] = readInt();
 190+ }
 191+ for(int strpos : strings){
 192+ pos = strpos;
 193+ ret.add(readString());
 194+ }
 195+ }
 196+ }
 197+ return ret;
 198+ }
 199+}
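The layout written by serialize() is [pair count] [hash, key offset]* [key length, key, context count, context offsets]* [string length, string]*, so get() can jump from the in-memory hash table straight to one key record and decode only the strings it needs. A hypothetical round trip (StringMapDemo is not part of this commit):

    import java.util.ArrayList;
    import java.util.HashMap;

    public class StringMapDemo {
        public static void main(String[] args) throws Exception {
            HashMap<String,ArrayList<String>> map = new HashMap<String,ArrayList<String>>();
            ArrayList<String> contexts = new ArrayList<String>();
            contexts.add("Rome is the capital of [[Italy]]");
            map.put("0:Italy", contexts);
            byte[] bytes = new StringMap(map).serialize(); // write side
            StringMap read = new StringMap(bytes);         // read side: decodes only the hash table
            System.out.println(read.get("0:Italy"));       // seeks into the byte array on demand
            System.out.println(read.get("0:France"));      // unknown key -> empty list
        }
    }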
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/ObjectCache.java
@@ -0,0 +1,67 @@
 2+package org.wikimedia.lsearch.ranks;
 3+
 4+import java.util.HashMap;
 7+
 8+/**
 9+ * Maintain a cache of objects. Cache is a simple FIFO cache of
 10+ * constant size. Oldest entries get replaced by newer ones.
 11+ *
 12+ * @author rainman
 13+ *
 14+ */
 15+public class ObjectCache {
 16+ /** used to maintain FIFO cache of valid keys */
 17+ protected String[] fifo;
 18+ /** storage of objects */
 19+ protected HashMap<String,Object> objs = new HashMap<String,Object>();
 20+ protected int size, inx;
 21+
 22+ protected long hits = 0;
 23+ protected long miss = 0;
 24+
 25+ protected int report = 0;
 26+
 27+ public ObjectCache(int size){
 28+ this.size = size;
 29+ this.fifo = new String[size];
 30+ this.inx = 0;
 31+ }
 32+
 33+ public void put(String key, Object obj){
 34+ // add to FIFO queue only if not already in it
 35+ if(!objs.containsKey(key)){
 36+ if(inx >= size)
 37+ inx = 0;
 38+ String del = fifo[inx];
 39+ if(del != null){
 40+ //remove oldest from cache
 41+ objs.remove(del);
 42+ }
 43+ fifo[inx] = key; // latest cached key
 44+ inx++;
 45+ }
 46+ objs.put(key,obj);
 47+ }
 48+
 49+ public Object get(String key){
 50+ if(++report >= 5000){
 51+ report = 0;
 52+ System.out.println(getStats());
 53+ }
 54+ Object obj = objs.get(key);
 55+ if(obj !=null )
 56+ hits++;
 57+ else
 58+ miss++;
 59+ return obj;
 60+ }
 61+
 62+ public String getStats(){
 63+ long total = hits+miss;
 64+ return "HITS: "+hits+" ("+((float)hits*100/total)+"%), MISS: "+miss+" ("+((float)miss*100/total)+"%)";
 65+ }
 66+
 67+
 68+}
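Usage sketch of the FIFO eviction (ObjectCacheDemo is hypothetical): with capacity 2, inserting a third key overwrites the oldest slot.

    public class ObjectCacheDemo {
        public static void main(String[] args){
            ObjectCache cache = new ObjectCache(2);
            cache.put("a", 1);
            cache.put("b", 2);
            cache.put("c", 3);                  // evicts "a", the oldest entry
            System.out.println(cache.get("a")); // null -> counted as a miss
            cache.get("b"); cache.get("c");     // hits
            System.out.println(cache.getStats());
        }
    }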
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/EnglishAnalyzer.java
@@ -30,6 +30,7 @@
3131 import org.apache.lucene.analysis.Analyzer;
3232 import org.apache.lucene.analysis.TokenStream;
3333 import org.wikimedia.lsearch.analyzers.WikiTokenizer;
 34+import org.wikimedia.lsearch.config.IndexId;
3435
3536 /**
3637 * @author Kate Turner
@@ -58,6 +59,6 @@
5960 if(streams.get(fieldName) != null)
6061 return streams.get(fieldName);
6162
62 - return new AliasPorterStemFilter(new WikiTokenizer(text,"en",false));
 63+ return new AliasPorterStemFilter(new WikiTokenizer(text,IndexId.get("enwiki"),false));
6364 }
6465 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java
@@ -18,6 +18,7 @@
1919 import org.wikimedia.lsearch.analyzers.WikiQueryParser.NamespacePolicy;
2020 import org.wikimedia.lsearch.config.Configuration;
2121 import org.wikimedia.lsearch.config.GlobalConfiguration;
 22+import org.wikimedia.lsearch.config.IndexId;
2223 import org.wikimedia.lsearch.index.WikiIndexModifier;
2324 import org.wikimedia.lsearch.search.NamespaceFilter;
2425
@@ -40,7 +41,7 @@
4142 WikiQueryParser.KEYWORD_BOOST = 0.05f;
4243 WikiQueryParser.ADD_TITLE_PHRASES = false;
4344 WikiIndexModifier.ALT_TITLES = 3;
44 - FieldBuilder.BuilderSet bs = new FieldBuilder("").getBuilder();
 45+ FieldBuilder.BuilderSet bs = new FieldBuilder(IndexId.get("enwiki")).getBuilder();
4546 FieldNameFactory ff = new FieldNameFactory();
4647 try{
4748 WikiQueryParser parser = new WikiQueryParser(bs.getFields().contents(),new SimpleAnalyzer(),bs,null);
@@ -126,9 +127,10 @@
127128 // extraction of phrases
128129 ArrayList<String> stopWords = new ArrayList<String>();
129130 stopWords.add("the"); stopWords.add("who");
130 - stopWords.add("is"); stopWords.add("a");
131 - Analyzer analyzer = Analyzers.getSearcherAnalyzer("en");
132 - bs = new FieldBuilder("en").getBuilder();
 131+ stopWords.add("is"); stopWords.add("a");
 132+ IndexId enwiki = IndexId.get("enwiki");
 133+ Analyzer analyzer = Analyzers.getSearcherAnalyzer(enwiki);
 134+ bs = new FieldBuilder(enwiki).getBuilder();
133135 parser = new WikiQueryParser(bs.getFields().title(),"0",analyzer,bs,NamespacePolicy.IGNORE,stopWords);
134136 assertEquals("[how, do, you, do]",parser.extractWords(parser.parseRaw("how do you do")).toString());
135137 assertEquals("[making, something, rest]",parser.extractWords(parser.parseRaw("(help:making something incategory:blah) OR (rest incategory:crest)")).toString());
@@ -230,8 +232,8 @@
231233 // ==================================
232234 // Tests with actual params :)
233235 // ==================================
234 - analyzer = Analyzers.getSearcherAnalyzer("en");
235 - bs = new FieldBuilder("en").getBuilder();
 236+ analyzer = Analyzers.getSearcherAnalyzer(enwiki);
 237+ bs = new FieldBuilder(enwiki).getBuilder();
236238 parser = new WikiQueryParser(bs.getFields().contents(),"0",analyzer,bs,NamespacePolicy.LEAVE);
237239 WikiQueryParser.ADD_STEM_TITLE = false;
238240 WikiQueryParser.STEM_TITLE_BOOST = 0;
@@ -354,8 +356,8 @@
355357 assertEquals("(+(contents:something contents:someth^0.5) +contents:for +(contents:godel contents:goedel)) (+title:something^2.0 +title:for^2.0 +(title:godel^2.0 title:goedel^2.0)) ((+alttitle1:something^6.0 +alttitle1:for^6.0 +(alttitle1:godel^6.0 alttitle1:goedel^6.0)) (+alttitle2:something^6.0 +alttitle2:for^6.0 +(alttitle2:godel^6.0 alttitle2:goedel^6.0)) (+alttitle3:something^6.0 +alttitle3:for^6.0 +(alttitle3:godel^6.0 alttitle3:goedel^6.0)))",q.toString());
356358
357359 // Backward compatiblity for complex filters
358 - analyzer = Analyzers.getSearcherAnalyzer("en");
359 - bs = new FieldBuilder("en").getBuilder();
 360+ analyzer = Analyzers.getSearcherAnalyzer(enwiki);
 361+ bs = new FieldBuilder(enwiki).getBuilder();
360362 parser = new WikiQueryParser(bs.getFields().contents(),"0,1,4,12",analyzer,bs,NamespacePolicy.IGNORE);
361363
362364 q = parser.parseTwoPass("beans everyone",NamespacePolicy.REWRITE);
@@ -381,15 +383,15 @@
382384 assertEquals("[(many,1,5), (more,7,11), (has,16,19), (some,23,27), (g,29,30)]",t.toString());
383385
384386 // German
385 - analyzer = Analyzers.getSearcherAnalyzer("de");
386 - bs = new FieldBuilder("de").getBuilder();
 387+ analyzer = Analyzers.getSearcherAnalyzer(IndexId.get("dewiki"));
 388+ bs = new FieldBuilder(IndexId.get("dewiki")).getBuilder();
387389 parser = new WikiQueryParser(bs.getFields().contents(),"0",analyzer,bs,NamespacePolicy.IGNORE);
388390 q = parser.parseTwoPass("welche rolle spielen Mineralstoffe in der Ernährung?",NamespacePolicy.IGNORE);
389391 assertEquals("(+(contents:welche contents:welch^0.5) +(contents:rolle contents:roll^0.5) +(contents:spielen contents:spiel^0.5) +(contents:mineralstoffe contents:mineralstoff^0.5) +contents:in +contents:der +(+(contents:ernahrung contents:ernahr^0.5) (contents:ernaehrung contents:ernaehr^0.5))) (+title:welche^2.0 +title:rolle^2.0 +title:spielen^2.0 +title:mineralstoffe^2.0 +title:in^2.0 +title:der^2.0 +(title:ernahrung^2.0 title:ernaehrung^2.0))",q.toString());
390392
391393 // CJK
392 - analyzer = Analyzers.getSearcherAnalyzer("ja");
393 - bs = new FieldBuilder("ja").getBuilder();
 394+ analyzer = Analyzers.getSearcherAnalyzer(IndexId.get("jawiki"));
 395+ bs = new FieldBuilder(IndexId.get("jawiki")).getBuilder();
394396 parser = new WikiQueryParser(bs.getFields().contents(),"0",analyzer,bs,NamespacePolicy.IGNORE);
395397 q = parser.parseFourPass("うろパン",NamespacePolicy.IGNORE,false);
396398 assertEquals("contents:\"うろ ろハ ハン\" title:\"うろ ろハ ハン\"^2.0 (alttitle1:\"うろ ろハ ハン\"^6.0 alttitle2:\"うろ ろハ ハン\"^6.0 alttitle3:\"うろ ろハ ハン\"^6.0)",q.toString());
@@ -402,8 +404,8 @@
403405
404406
405407 // Malayalam
406 - analyzer = Analyzers.getSearcherAnalyzer("ml");
407 - bs = new FieldBuilder("ml").getBuilder();
 408+ analyzer = Analyzers.getSearcherAnalyzer(IndexId.get("mlwiki"));
 409+ bs = new FieldBuilder(IndexId.get("mlwiki")).getBuilder();
408410 parser = new WikiQueryParser(bs.getFields().contents(),"0",analyzer,bs,NamespacePolicy.IGNORE);
409411 q = parser.parseFourPass("കൊറിയ ",NamespacePolicy.IGNORE,false);
410412 assertEquals("contents:കറയ title:കറയ^2.0 (alttitle1:കറയ^6.0 alttitle2:കറയ^6.0 alttitle3:കറയ^6.0)",q.toString());
@@ -420,8 +422,8 @@
421423 WikiQueryParser.STEM_TITLE_BOOST = 1;
422424
423425 // Localization tests
424 - analyzer = Analyzers.getSearcherAnalyzer("sr");
425 - bs = new FieldBuilder("sr").getBuilder();
 426+ analyzer = Analyzers.getSearcherAnalyzer(IndexId.get("srwiki"));
 427+ bs = new FieldBuilder(IndexId.get("srwiki")).getBuilder();
426428 parser = new WikiQueryParser(bs.getFields().contents(),"0",analyzer,bs,NamespacePolicy.LEAVE);
427429
428430 q = parser.parseTwoPass("all:добродошли на википедију",NamespacePolicy.IGNORE);
@@ -430,8 +432,8 @@
431433 q = parser.parseTwoPass("all:dobrodošli na šđčćž",NamespacePolicy.IGNORE);
432434 assertEquals("(+contents:dobrodosli +contents:na +contents:sdjccz) (+title:dobrodosli^3.0 +title:na^3.0 +title:sdjccz^3.0)",q.toString());
433435
434 - analyzer = Analyzers.getSearcherAnalyzer("th");
435 - bs = new FieldBuilder("th").getBuilder();
 436+ analyzer = Analyzers.getSearcherAnalyzer(IndexId.get("thwiki"));
 437+ bs = new FieldBuilder(IndexId.get("thwiki")).getBuilder();
436438 parser = new WikiQueryParser(bs.getFields().contents(),"0",analyzer,bs,NamespacePolicy.LEAVE);
437439
438440 q = parser.parseTwoPass("ภาษาไทย",NamespacePolicy.IGNORE);
@@ -441,8 +443,8 @@
442444 assertEquals("(+namespace:12 +(+contents:ภาษา +contents:ไทย)) (+namespace:12 +(+title:ภาษา^3.0 +title:ไทย^3.0))",q.toString());
443445
444446 // vietnamese
445 - analyzer = Analyzers.getSearcherAnalyzer("vi");
446 - bs = new FieldBuilder("vi").getBuilder();
 447+ analyzer = Analyzers.getSearcherAnalyzer(IndexId.get("viwiki"));
 448+ bs = new FieldBuilder(IndexId.get("viwiki")).getBuilder();
447449 parser = new WikiQueryParser(bs.getFields().contents(),"0",analyzer,bs,NamespacePolicy.LEAVE);
448450
449451 q = parser.parseTwoPass("Gánh nước đêm trăng",NamespacePolicy.IGNORE);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java
@@ -11,11 +11,12 @@
1212 import org.apache.lucene.analysis.Token;
1313 import org.apache.lucene.analysis.TokenStream;
1414 import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine;
 15+import org.wikimedia.lsearch.config.IndexId;
1516 import org.wikimedia.lsearch.index.WikiIndexModifier;
1617
1718 public class FastWikiTokenizerTest {
1819 public static void displayTokensForParser(String text) {
19 - FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,"en",false);
 20+ FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,IndexId.get("enwiki"),false);
2021 Token[] tokens = parser.parse().toArray(new Token[] {});
2122 for (int i = 0; i < tokens.length; i++) {
2223 Token token = tokens[i];
@@ -116,7 +117,7 @@
117118 for(int i=0;i<2000;i++){
118119 for(TestArticle article : articles){
119120 String text = article.content;
120 - FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,"en",false);
 121+ FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,IndexId.get("enwiki"),false);
121122 parser.parse();
122123 }
123124 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SpellCheckTest.java
@@ -94,6 +94,8 @@
9595 {"cource", "course"},
9696 {"carolene products",""},
9797 {"orvileWright","overnight"},
 98+ {"livia tremor control","olivia tremor control"},
 99+ {"ommmited","omitted"},
98100
99101 };
100102
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/SearchDaemon.java
@@ -86,6 +86,10 @@
8787 sendOutputLine("#no suggestion");
8888 for(ResultSet rs : res.getResults()){
8989 sendResultLine(rs.score, rs.namespace, rs.title);
 90+ if(rs.getContext() != null){
 91+ for(String c : rs.getContext())
 92+ sendOutputLine("#context "+c);
 93+ }
9094 if(rs.getExplanation() != null)
9195 sendOutputLine(rs.getExplanation().toString());
9296 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/HttpHandler.java
@@ -41,7 +41,7 @@
4242 protected String postData;
4343
4444 protected final int BUF_SIZE = 8192;
45 - protected final char[] outputBuffer = new char[BUF_SIZE];
 45+ protected char[] outputBuffer = new char[BUF_SIZE];
4646 protected int bufLength = 0;
4747
4848 protected int minorVersion; // the x in HTTP 1.x
@@ -227,9 +227,12 @@
228228 log.debug(">>>"+sout);
229229 // write to buffer instead directly to stream!
230230 char[] s = (sout+"\r\n").toCharArray();
231 - if(bufLength + s.length >= BUF_SIZE)
 231+ if(bufLength + s.length >= outputBuffer.length)
232232 flushOutput();
233 - // FIXME: what if array is 2x larger than buffer?
 233+ // extend buffer if a single line exceeds its capacity
 234+ if(s.length > outputBuffer.length){
 235+ outputBuffer = new char[s.length*2];
 236+ }
234237 System.arraycopy(s,0,outputBuffer,bufLength,s.length);
235238 bufLength+=s.length;
236239 }
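A trace of the corrected buffering rule with hypothetical sizes (assumes flushOutput() resets bufLength to 0):

    // BUF_SIZE = 8192, bufLength = 100, incoming line s.length = 20000
    // 1) 100 + 20000 >= 8192           -> flushOutput(), bufLength = 0
    // 2) 20000 > outputBuffer.length   -> outputBuffer = new char[40000]
    // 3) arraycopy(s, 0, outputBuffer, 0, 20000) now fits in one pass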
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestTest.java
@@ -46,7 +46,7 @@
4747 Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid,false);
4848 NamespaceFilter nsDefault = new NamespaceFilter("0"); // default to main namespace
4949 FieldBuilder.Case dCase = FieldBuilder.Case.IGNORE_CASE;
50 - FieldBuilder.BuilderSet bs = new FieldBuilder(global.getLanguage(iid.getDBname()),dCase).getBuilder(dCase);
 50+ FieldBuilder.BuilderSet bs = new FieldBuilder(iid,dCase).getBuilder(dCase);
5151 WikiQueryParser parser = new WikiQueryParser(bs.getFields().contents(),nsDefault,analyzer,bs,WikiQueryParser.NamespacePolicy.IGNORE,null);
5252 while(true){
5353 System.out.print(">> ");
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexWriter.java
@@ -42,7 +42,7 @@
4343 public CleanIndexWriter(IndexId iid) throws IOException{
4444 GlobalConfiguration global = GlobalConfiguration.getInstance();
4545 this.iid = iid;
46 - this.builder = new FieldBuilder("",FieldBuilder.Case.IGNORE_CASE,FieldBuilder.Stemmer.NO_STEMMER,FieldBuilder.Options.SPELL_CHECK);
 46+ this.builder = new FieldBuilder(iid,FieldBuilder.Case.IGNORE_CASE,FieldBuilder.Stemmer.NO_STEMMER,FieldBuilder.Options.SPELL_CHECK);
4747 this.langCode = global.getLanguage(iid.getDBname());
4848 HashSet<String> stopWords = new HashSet<String>();
4949 for(String w : StopWords.getStopWords(iid,langCode))
@@ -90,7 +90,7 @@
9191 if(!WikiIndexModifier.checkAddPreconditions(a,langCode))
9292 return; // don't add if preconditions are not met
9393
94 - Object[] ret = WikiIndexModifier.makeDocumentAndAnalyzer(a,builder,iid);
 94+ Object[] ret = WikiIndexModifier.makeDocumentAndAnalyzer(a,builder,iid,null);
9595 Document doc = (Document) ret[0];
9696 Analyzer analyzer = (Analyzer) ret[1];
9797 try {
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/SpellCheckIndexer.java
@@ -176,13 +176,13 @@
177177 }
178178
179179 /**
180 - * Register a title in the index, without tokenization, just lowercase.
 180+ * Register a title in the index, without tokenization, stripped of accents and the like.
181181 *
182182 * @param title
183183 */
184184 public void addTitle(String title){
185185 Document doc = new Document();
186 - doc.add(new Field("title", title.toLowerCase(), Field.Store.NO, Field.Index.UN_TOKENIZED));
 186+ doc.add(new Field("title", FastWikiTokenizerEngine.stipTitle(title.toLowerCase()), Field.Store.NO, Field.Index.UN_TOKENIZED));
187187 ngramWriter.addDocument(doc);
188188 }
189189 /**
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/related/CompactRelated.java
@@ -36,7 +36,7 @@
3737 this.title = title;
3838 }
3939 public String serialize(){
40 - return score+" "+relates;
 40+ return ((float)score)+" "+relates;
4141 }
4242
4343 public static ArrayList<String> convertToStringList(Collection<CompactRelated> rel){
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/related/RelatedBuilder.java
@@ -18,11 +18,14 @@
1919 import org.mediawiki.dumper.ProgressFilter;
2020 import org.mediawiki.dumper.Tools;
2121 import org.mediawiki.importer.XmlDumpReader;
 22+import org.wikimedia.lsearch.beans.Title;
2223 import org.wikimedia.lsearch.config.Configuration;
2324 import org.wikimedia.lsearch.config.GlobalConfiguration;
2425 import org.wikimedia.lsearch.config.IndexId;
 26+import org.wikimedia.lsearch.config.IndexRegistry;
2527 import org.wikimedia.lsearch.index.IndexThread;
2628 import org.wikimedia.lsearch.ranks.Links;
 29+import org.wikimedia.lsearch.search.NamespaceFilter;
2730 import org.wikimedia.lsearch.spell.api.Dictionary;
2831 import org.wikimedia.lsearch.spell.api.Dictionary.Word;
2932 import org.wikimedia.lsearch.storage.ArticleAnalytics;
@@ -46,7 +49,7 @@
4750 System.out.println("MediaWiki Lucene search indexer - build a map of related articles.\n");
4851
4952 Configuration.open();
50 - if(args.length > 2 && args.length < 1){
 53+ if(args.length > 2 || args.length < 1){
5154 System.out.println("Syntax: java RelatedBuilder <dbname> [<dump file>]");
5255 return;
5356 }
@@ -64,7 +67,7 @@
6568 if(dumpfile != null)
6669 rebuildFromDump(dumpfile,iid);
6770 else
68 - rebuildFromTemp(iid);
 71+ rebuildFromLinks(iid);
6972 } catch (IOException e) {
7073 log.fatal("Rebuild I/O error: "+e.getMessage());
7174 e.printStackTrace();
@@ -83,7 +86,8 @@
8487 // first pass - titles
8588 InputStream input = null;
8689 input = Tools.openInputFile(inputfile);
87 - TitleReader tr = new TitleReader(langCode);
 90+ NamespaceFilter nsf = GlobalConfiguration.getInstance().getDefaultNamespace(iid);
 91+ TitleReader tr = new TitleReader(iid,langCode,nsf);
8892 XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(tr, 5000));
8993 reader.readDump();
9094 input.close();
@@ -104,32 +108,42 @@
105109 * Rebuild related articles index for iid
106110 * @throws IOException
107111 */
108 - public static void rebuildFromTemp(IndexId iid) throws IOException {
 112+ public static void rebuildFromLinks(IndexId iid) throws IOException {
109113 CompactLinks links = new CompactLinks();
110 - Links temp = Links.openExisting(iid);
 114+ Links temp = Links.openForRead(iid,iid.getLinks().getImportPath());
111115
112 - log.info("Reading all titles");
 116+ NamespaceFilter nsf = GlobalConfiguration.getInstance().getDefaultNamespace(iid);
 117+ log.info("Reading titles in default search");
113118 Dictionary dict = temp.getKeys();
114119 Word w;
115120 HashMap<Integer,CompactArticleLinks> keyCache = new HashMap<Integer,CompactArticleLinks>();
116121 while((w = dict.next()) != null){
117122 String key = w.getWord();
118 - links.add(key,temp.getNumInLinks(key));
119 - keyCache.put(temp.getDocId(key),links.get(key));
 123+ int ns = Integer.parseInt(key.substring(0,key.indexOf(':')));
 124+ if(nsf.contains(ns)){
 125+ links.add(key,temp.getNumInLinks(key));
 126+ keyCache.put(temp.getDocId(key),links.get(key));
 127+ }
120128 }
121129
122130 log.info("Reading in/out links");
123131 dict = temp.getKeys();
124132 while((w = dict.next()) != null){
125133 String key = w.getWord();
126 - CompactArticleLinks l = links.get(key);
127 - // inlinks
128 - l.setInLinks(temp.getInLinks(l,keyCache));
129 - // outlinks
130 - ArrayList<CompactArticleLinks> out = new ArrayList<CompactArticleLinks>();
131 - for(String k : temp.getOutLinks(key).toCollection())
132 - out.add(links.get(k));
133 - l.setOutLinks(out);
 134+ int ns = Integer.parseInt(key.substring(0,key.indexOf(':')));
 135+ if(nsf.contains(ns)){
 136+ CompactArticleLinks l = links.get(key);
 137+ // inlinks
 138+ l.setInLinks(temp.getInLinks(l,keyCache));
 139+ // outlinks
 140+ ArrayList<CompactArticleLinks> out = new ArrayList<CompactArticleLinks>();
 141+ for(String k : temp.getOutLinks(key).toCollection()){
 142+ CompactArticleLinks cs = links.get(k);
 143+ if(cs != null)
 144+ out.add(cs);
 145+ }
 146+ l.setOutLinks(out);
 147+ }
134148 }
135149 temp.close();
136150 temp = null; // GC
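The rebuild now reads the links index directly (hence the rename to rebuildFromLinks) and restricts both passes to the default search namespaces. Keys in the links index take the form "<ns>:<title>", so the namespace is recovered by parsing the integer prefix before the first colon. Note also the new null guard on out-links: once titles outside the filter are dropped, links.get(k) can legitimately return null for an out-link target, which previously would have been added unconditionally. A small sketch of the key convention (helper name and sample keys are illustrative):

    // KeySketch.java -- illustrative helper for the "<ns>:<title>" key format
    // used by the links index above.
    public class KeySketch {
        static int namespaceOf(String key) {
            // e.g. "0:Main Page" -> 0, "14:Physics" -> 14
            return Integer.parseInt(key.substring(0, key.indexOf(':')));
        }
        public static void main(String[] args) {
            System.out.println(namespaceOf("0:Main Page")); // 0, the main namespace
            System.out.println(namespaceOf("14:Physics"));  // 14, categories
        }
    }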
@@ -144,14 +158,19 @@
145159 RelatedStorage store = new RelatedStorage(iid);
146160 int num = 0;
147161 int total = links.getAll().size();
148 - for(CompactArticleLinks cs : links.getAll()){
 162+ NamespaceFilter nsf = GlobalConfiguration.getInstance().getDefaultNamespace(iid);
 163+ for(CompactArticleLinks cs : links.getAll()){
149164 num++;
150165 if(num % 1000 == 0)
151 - log.info("Storing ["+num+"/"+total+"]");
152 - ArrayList<CompactRelated> rel = getRelated(cs,links);
153 - if(rel.size() == 0)
154 - continue;
155 - store.addRelated(cs.toString(),rel);
 166+ log.info("Storing ["+num+"/"+total+"]");
 167+ Title t = new Title(cs.getKey());
 168+ // do analysis only for default search namespace (usually main namespace)
 169+ if(nsf.contains(t.getNamespace())){
 170+ ArrayList<CompactRelated> rel = getRelated(cs,links);
 171+ if(rel.size() == 0)
 172+ continue;
 173+ store.addRelated(cs.toString(),rel);
 174+ }
156175 }
157176 store.snapshot();
158177 }
@@ -161,15 +180,19 @@
162181 */
163182 public static ArrayList<CompactRelated> getRelated(CompactArticleLinks cs, CompactLinks links){
164183 ArrayList<CompactRelated> ret = new ArrayList<CompactRelated>();
165 -
166 - HashSet<CompactArticleLinks> ll = new HashSet<CompactArticleLinks>();
 184+
 185+ HashSet<CompactArticleLinks> ll = new HashSet<CompactArticleLinks>();
 186+ double maxnorm = 0; // maximal value for related score, used for scaling
167187 if(cs.linksIn != null){
168 - for(CompactArticleLinks csl : cs.linksIn)
 188+ for(CompactArticleLinks csl : cs.linksIn){
169189 ll.add(csl);
 190+ maxnorm += 1.0/norm(csl.links);
 191+ }
170192 }
171193 for(CompactArticleLinks from : ll){
172194 if(from != cs){
173 - double score = relatedScore(cs,ll,from);
 195+ double rscore = relatedScore(cs,ll,from);
 196+ double score = (rscore / maxnorm) * rscore;
174197 if(score != 0)
175198 ret.add(new CompactRelated(cs,from,score));
176199 }
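The stored score is no longer the raw relatedScore() value. maxnorm accumulates 1.0/norm(csl.links) over every in-linking page, i.e. the largest sum a perfectly related page could reach, and the result is (rscore/maxnorm)*rscore: the raw score scaled into [0,1] and multiplied back, which damps weakly related pages quadratically. A sketch of the rescaling; norm() is not defined in this hunk, so the stand-in below (growing with the link count) is an assumption, as is the shape of relatedScore()'s accumulation:

    // ScoreSketch.java -- illustrative rescaling, mirroring the hunk above.
    public class ScoreSketch {
        // stand-in for the project's norm(); its actual definition is not in this diff
        static double norm(int links) {
            return Math.max(1, links);
        }
        public static void main(String[] args) {
            int[] inLinkCounts = {3, 10, 50};   // link counts of pages linking in
            double maxnorm = 0;                 // best attainable related score
            for (int links : inLinkCounts)
                maxnorm += 1.0 / norm(links);
            double rscore = 0.2;                // hypothetical raw relatedScore() output
            double score = (rscore / maxnorm) * rscore;
            System.out.println("maxnorm=" + maxnorm + " score=" + score);
        }
    }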
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/related/LinkReader.java
@@ -80,34 +80,14 @@
8181 rank = links.get(key);
8282 if(rank != null)
8383 return rank;
84 - // try lowercase
85 - key = ns+":"+title.toLowerCase();
86 - rank = links.get(key);
87 - if(rank != null)
88 - return rank;
8984 // try lowercase with first letter upper case
9085 if(title.length()==1)
9186 key = ns+":"+title.toUpperCase();
9287 else
93 - key = ns+":"+title.substring(0,1).toUpperCase()+title.substring(1).toLowerCase();
 88+ key = ns+":"+title.substring(0,1).toUpperCase()+title.substring(1);
9489 rank = links.get(key);
9590 if(rank != null)
9691 return rank;
97 - // try title case
98 - key = ns+":"+WordUtils.capitalize(title);
99 - rank = links.get(key);
100 - if(rank != null)
101 - return rank;
102 - // try upper case
103 - key = ns+":"+title.toUpperCase();
104 - rank = links.get(key);
105 - if(rank != null)
106 - return rank;
107 - // try capitalizing at word breaks
108 - key = ns+":"+WordUtils.capitalize(title,new char[] {' ','-','(',')','}','{','.',',','?','!'});
109 - rank = links.get(key);
110 - if(rank != null)
111 - return rank;
11292
11393 return null;
11494 }
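The case-variant cascade in LinkReader is cut down: the lowercase, Title Case, UPPER CASE and capitalize-at-word-breaks lookups are all gone, leaving only the exact key and a first-letter-uppercase fallback, which now preserves the rest of the title instead of lower-casing it. A sketch of the surviving normalization (method and class name are illustrative):

    // FirstUpperSketch.java -- the one fallback lookup LinkReader still tries.
    public class FirstUpperSketch {
        static String firstUpper(String title) {
            if (title.length() == 1)
                return title.toUpperCase();
            // the tail is preserved as-is; it is no longer lower-cased
            return title.substring(0, 1).toUpperCase() + title.substring(1);
        }
        public static void main(String[] args) {
            System.out.println(firstUpper("quantum Mechanics")); // "Quantum Mechanics"
            System.out.println(firstUpper("x"));                 // "X"
        }
    }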
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/related/RelatedTitle.java
@@ -1,10 +1,13 @@
22 package org.wikimedia.lsearch.related;
33
 4+import java.util.ArrayList;
 5+
46 import org.wikimedia.lsearch.beans.Title;
57
68 public class RelatedTitle {
79 protected Title related;
810 protected double score;
 11+ protected ArrayList<String> contexts = null;
912
1013 public RelatedTitle(Title related, double score) {
1114 this.related = related;
@@ -22,6 +25,12 @@
2326 public void setScore(double score) {
2427 this.score = score;
2528 }
 29+ public ArrayList<String> getContexts() {
 30+ return contexts;
 31+ }
 32+ public void setContexts(ArrayList<String> contexts) {
 33+ this.contexts = contexts;
 34+ }
2635 @Override
2736 public String toString() {
2837 return related.toString()+" ("+score+")";
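RelatedTitle gains a contexts list, presumably the per-link context snippets whose extraction this branch is experimenting with; the diff only shows the accessor plumbing, and the field stays null until explicitly set. A hypothetical usage; the sample context string is invented, and Title's single-String constructor is inferred from its use as new Title(cs.getKey()) earlier in this diff:

    import java.util.ArrayList;
    import org.wikimedia.lsearch.beans.Title;
    import org.wikimedia.lsearch.related.RelatedTitle;

    public class RelatedTitleUsage {
        public static void main(String[] args) {
            RelatedTitle rt = new RelatedTitle(new Title("0:Example"), 1.5);
            ArrayList<String> ctx = new ArrayList<String>();
            ctx.add("... the sentence in which the related article was linked ...");
            rt.setContexts(ctx);   // contexts is null until set
            System.out.println(rt + " contexts=" + rt.getContexts());
        }
    }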
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/related/TitleReader.java
@@ -12,6 +12,8 @@
1313 import org.mediawiki.importer.Revision;
1414 import org.mediawiki.importer.Siteinfo;
1515 import org.wikimedia.lsearch.beans.ArticleLinks;
 16+import org.wikimedia.lsearch.config.IndexId;
 17+import org.wikimedia.lsearch.search.NamespaceFilter;
1618 import org.wikimedia.lsearch.util.Localization;
1719
1820 /**
@@ -25,9 +27,13 @@
2628 Revision revision;
2729 CompactLinks links = new CompactLinks();
2830 protected String langCode;
 31+ protected IndexId iid;
 32+ protected NamespaceFilter nsf;
2933
30 - public TitleReader(String langCode){
 34+ public TitleReader(IndexId iid, String langCode, NamespaceFilter nsf){
3135 this.langCode = langCode;
 36+ this.iid = iid;
 37+ this.nsf = nsf;
3238 }
3339
3440 public void writeRevision(Revision revision) throws IOException {
@@ -37,8 +43,10 @@
3844 this.page = page;
3945 }
4046 public void writeEndPage() throws IOException {
41 - String key = page.Title.Namespace+":"+page.Title.Text;
42 - links.add(key,0);
 47+ if(nsf.contains(page.Title.Namespace)){
 48+ String key = page.Title.Namespace+":"+page.Title.Text;
 49+ links.add(key,0);
 50+ }
4351 }
4452 public CompactLinks getTitles() {
4553 return links;
@@ -54,7 +62,7 @@
5563 Iterator it = info.Namespaces.orderedEntries();
5664 while(it.hasNext()){
5765 Entry<Integer,String> pair = (Entry<Integer,String>)it.next();
58 - Localization.addCustomMapping(pair.getValue(),pair.getKey(),langCode);
 66+ Localization.addCustomMapping(pair.getValue(),pair.getKey(),iid.getDBname());
5967 }
6068 }
6169 public void writeStartWiki() throws IOException {
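Two changes in TitleReader: the first dump pass now records only titles whose namespace passes the NamespaceFilter, and custom namespace names scraped from the dump's siteinfo are registered per database name rather than per language code, so two wikis that share a language no longer clobber each other's namespace aliases. Illustrative calls; the wiki names are examples, and the (String, Integer, String) argument order is as used in the hunk above:

    import org.wikimedia.lsearch.util.Localization;

    public class MappingSketch {
        public static void main(String[] args) {
            // keyed by dbname now, so same-language wikis keep separate aliases
            Localization.addCustomMapping("Wikipedia", 4, "enwiki");       // was keyed by "en"
            Localization.addCustomMapping("Wiktionary", 4, "enwiktionary");
        }
    }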
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerClient.java
@@ -73,7 +73,7 @@
7474 log.debug("Calling remotely indexUpdate("+myhost+","+iid+") on "+host);
7575 r.indexUpdated(myhost,iid.toString());
7676 } catch (Exception e) {
77 - log.warn("Error invoking remote method notifyIndexUpdated() on host "+host);
 77+ log.warn("Error invoking remote method notifyIndexUpdated() on host "+host+" : "+e.getMessage());
7878 continue;
7979 }
8080 }
@@ -102,10 +102,10 @@
103103 log.debug("Got new RMI messenger for host "+host);
104104 return r;
105105 } catch (RemoteException e) {
106 - log.warn("Cannot contact RMI registry for host "+host);
 106+ log.warn("Cannot contact RMI registry for host "+host+" : "+e.getMessage());
107107 throw e;
108108 } catch (NotBoundException e) {
109 - log.warn("No RMIMessenger instance at host "+host);
 109+ log.warn("No RMIMessenger instance at host "+host+" : "+e.getMessage());
110110 throw e;
111111 }
112112 }
@@ -126,7 +126,7 @@
127127 return res;
128128 } catch (Exception e) {
129129 //e.printStackTrace();
130 - log.warn("Error invoking remote method getIndexTimestamp() on host "+host);
 130+ log.warn("Error invoking remote method getIndexTimestamp() on host "+host+" : "+e.getMessage());
131131 }
132132 return null;
133133 }
@@ -137,7 +137,7 @@
138138 log.debug("Calling enqueueUpdateRecords("+records.length+" records) on "+host);
139139 r.enqueueUpdateRecords(records);
140140 } catch (Exception e) {
141 - log.warn("Error invoking remote method enqueueUpdateRecords() on host "+host);
 141+ log.warn("Error invoking remote method enqueueUpdateRecords() on host "+host+" : "+e.getMessage());
142142 throw e;
143143 }
144144 }
@@ -148,7 +148,7 @@
149149 log.debug("Calling enqueueFrontend("+records.length+" records) on "+host);
150150 r.enqueueFrontend(records);
151151 } catch (Exception e) {
152 - log.warn("Error invoking remote method enqueueFrontend() on host "+host);
 152+ log.warn("Error invoking remote method enqueueFrontend() on host "+host+" : "+e.getMessage());
153153 throw e;
154154 }
155155 }
@@ -159,7 +159,7 @@
160160 log.debug("Calling reportBack("+cards.length+" records) on "+host);
161161 r.reportBack(cards);
162162 } catch (Exception e) {
163 - log.warn("Error invoking remote method sendReports on host "+host);
 163+ log.warn("Error invoking remote method sendReports on host "+host+" : "+e.getMessage());
164164 }
165165 }
166166
@@ -177,7 +177,7 @@
178178 cache.invalidateSearchable(iid,host);
179179 SearchResults res = new SearchResults();
180180 res.retry();
181 - log.warn("Error invoking remote method searchPart on host "+host);
 181+ log.warn("Error invoking remote method searchPart on host "+host+" : "+e.getMessage());
182182 return res;
183183 }
184184 }
@@ -188,7 +188,7 @@
189189 log.debug("Calling requestFlushAndNotify("+dbname+" records) on "+host);
190190 return r.requestFlushAndNotify(dbname);
191191 } catch (Exception e) {
192 - log.warn("Error invoking remote method requestFlushAndNotify on host "+host);
 192+ log.warn("Error invoking remote method requestFlushAndNotify on host "+host+" : "+e.getMessage());
193193 return false;
194194 }
195195 }
@@ -199,7 +199,7 @@
200200 log.debug("Calling isSuccessfulFlush("+dbname+" records) on "+host);
201201 return r.isSuccessfulFlush(dbname);
202202 } catch (Exception e) {
203 - log.warn("Error invoking remote method isSuccessfulFlush on host "+host);
 203+ log.warn("Error invoking remote method isSuccessfulFlush on host "+host+" : "+e.getMessage());
204204 throw new IOException("Remote error");
205205 }
206206 }
@@ -212,7 +212,7 @@
213213 log.debug(" \\-> got: "+size);
214214 return size;
215215 } catch (Exception e) {
216 - log.warn("Error invoking remote method getIndexerQueueSize on host "+host);
 216+ log.warn("Error invoking remote method getIndexerQueueSize on host "+host+" : "+e.getMessage());
217217 return -1;
218218 }
219219 }
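Every catch block in RMIMessengerClient gets the same mechanical improvement: the warning now carries e.getMessage(), so a refused connection is distinguishable from an unbound messenger or a dead registry. A sketch of the pattern; a log4j Logger is assumed here, and the host and exception are fabricated for the demo:

    import org.apache.log4j.BasicConfigurator;
    import org.apache.log4j.Logger;

    public class WarnSketch {
        static Logger log = Logger.getLogger(WarnSketch.class);
        public static void main(String[] args) {
            BasicConfigurator.configure(); // console appender for the demo
            String host = "192.168.0.1";
            try {
                throw new java.rmi.RemoteException("connection refused");
            } catch (Exception e) {
                // appending the message turns an opaque warning into a diagnosable one
                log.warn("Error invoking remote method searchPart on host " + host + " : " + e.getMessage());
            }
        }
    }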