Index: branches/lucene-search-2.1/lsearch-global.conf
@@ -18,19 +18,20 @@
19 | 19 | wikilucene : (nssplit,3) (nspart1,[0]) (nspart2,[4,5,12,13]), (nspart3,[]) |
20 | 20 | wikilucene : (language,en) (warmup,10) |
21 | 21 | wikilucene : (spell,3,1) (prefix) |
| 22 | +enwiki,viwiki,srwiki,eswiki,dewiki,mlwiki,zhwiki,jawiki,itwiki,thwiki : (single) |
22 | 23 | |
23 | 24 | # Search groups |
24 | 25 | # Index parts of a split index are always taken from the node's group |
25 | 26 | # host : db1.part db2.part |
26 | 27 | # Multiple hosts can search multiple dbs (N-N mapping)
27 | 28 | [Search-Group] |
28 | | -oblak : wikilucene wikidev wikilucene.prefix |
| 29 | +oblak : wikilucene wikidev wikilucene.prefix wikilucene.related wikilucene.links |
29 | 30 | |
30 | 31 | # Index nodes |
31 | 32 | # host: db1.part db2.part |
32 | 33 | # Each db.part can be indexed by only one host |
33 | 34 | [Index] |
34 | | -oblak: wikilucene wikidev |
| 35 | +oblak: enwiki wikilucene wikidev viwiki srwiki eswiki dewiki mlwiki zhwiki jawiki itwiki thwiki |
35 | 36 | |
36 | 37 | # Rsync path where indexes are on hosts, after default value put |
37 | 38 | # hosts where the location differs |
Index: branches/lucene-search-2.1/test-data/indexing-articles.test |
@@ -391,3 +391,419 @@
392 | 392 | [[tr:Maxwell denklemleri]] |
393 | 393 | [[zh:麦克斯韦方程组]] |
394 | 394 | |
| 395 | +### namespace = 0 |
| 396 | +### title = Douglas Adams |
| 397 | +### content |
| 398 | + |
| 399 | +==Education and early works== |
| 400 | +[[Image:Douglas Adams Sign from HH cover.jpg|200px|right|thumb|Douglas Adams was known to some fans as ''Bop Ad'' - after his illegible signature]]Adams first attended Primrose Hill Primary School in Brentwood. He took the exams and interviewed for [[Brentwood School (England)|Brentwood School]] at age six, and attended the [[Preparatory school (UK)|preparatory school]] from 1959 to 1964, then the main school until 1970. He was in the top stream, and specialised in the arts in the sixth form, after which he stayed an extra term in a special seventh form class, customary in the school for those preparing for [[Oxbridge]] entrance exams. |
| 401 | + |
| 402 | +While at the prep school, his English teacher, Frank Halford, reportedly |
| 403 | +awarded Adams the only ten out of ten of his entire teaching career for a |
| 404 | +creative writing exercise.<ref>http://www.bbc.co.uk/dna/h2g2/A3790659</ref> |
| 405 | +Adams remembered this for the rest of his life, especially when facing writer's |
| 406 | +block.<ref>{{cite book | author=Adams, Douglas | title=The Salmon of Doubt: |
| 407 | +Hitchhiking the Galaxy One Last Time | edition=US mass market paperback edition | |
| 408 | +publisher=Ballantine | year=2005 | pages=Page xix | id=ISBN 0-345-45529-0}}</ref> |
| 409 | +Some of Adams' earliest writing was published at the school, such as a report on |
| 410 | +the school's Photography Club in ''The Brentwoodian'' (in 1962) or spoof reviews |
| 411 | +in the school magazine ''Broadsheet'' (edited by [[Paul Neil Milne Johnstone]]). |
| 412 | +Adams also had a letter and short story published nationally in the UK in the boys' |
| 413 | +magazine ''The Eagle'' in 1965. He met [[Griff Rhys Jones]], who was in the year below, |
| 414 | +at the school, and was in the same class as "Stuckist" artist |
| 415 | +[[Charles Thomson (artist)|Charles Thomson]]; all three appeared together in |
| 416 | +a production of Shakespeare's ''Julius Caesar'' in 1968. He was six feet tall (1.83 m)
| 417 | +by the time he was 12, and he only stopped growing at 6'5" (1.96 m).
| 418 | +Later, he would often make self-deprecating jokes about his towering stature,
| 419 | +"...the form-master wouldn't say 'Meet under the clock tower,' or |
| 420 | +'Meet under the War Memorial,' but 'Meet under Adams.'" |
| 421 | +<ref>{{cite book | author=Adams, Douglas | title=The Salmon of Doubt: |
| 422 | +Hitchhiking the Galaxy One Last Time | edition=First UK hardcover edition | |
| 423 | +publisher=Macmillan | year=2002 | pages=Page 7 | id=ISBN 0-333-76657-1}}</ref> |
| 424 | + |
| 425 | +### namespace = 0 |
| 426 | +### title = Aaliyah |
| 427 | +### content |
| 428 | + |
| 429 | +{{Two other uses||Aaliyah's self-titled album|Aaliyah (album)||Aliyah (disambiguation)}} |
| 430 | +{{Infobox musical artist <!-- See Wikipedia:WikiProject Musicians --> |
| 431 | +| Name = Aaliyah |
| 432 | +| Img = Aaliyah5301.jpg<!--fair use image to be used only in this article--> |
| 433 | +| Img_capt = Promotional photo of Aaliyah from May 2001 |
| 434 | +| Img_size = <!-- Only for images narrower than 220 pixels --> |
| 435 | +| Landscape = |
| 436 | +| Background = solo_singer |
| 437 | +| Birth_name = Aaliyah Dana Haughton<ref name="Aaliyah NNDB Profile">{{cite web| url =http://www.nndb.com/people/742/000024670/| title = Aaliyah NNDB Profile| publisher =NNDB| accessdate =2007-03-03}}</ref> |
| 438 | +| Alias = Baby Girl, The Princess of Hip-Hop Soul<br>Liyah<br>Wonder Woman<ref>[http://www.imdb.com/name/nm0004691/bio Aaliyah on IMDb]</ref> |
| 439 | +| Born = {{birth date|1979|1|16}}<br><small>[[Brooklyn]], [[New York]], [[United States|U.S.]]</small> |
| 440 | +| Died = {{death date and age|2001|8|25|1979|1|16}}<br><small>[[Abaco Islands]], [[The Bahamas]]</small> |
| 441 | +| Origin = {{Flagicon|USA}} [[Detroit, Michigan|Detroit]], [[Michigan]], [[United States|U.S.]] |
| 442 | +| Genre = [[R&B]], [[Hip hop soul]], [[Dance music|Dance]], [[Pop music|Pop]] |
| 443 | +| Occupation = [[Singer]], [[Model (person)|model]], [[dancer]], [[actress]] |
| 444 | +| Years_active = 1994 – 2001 |
| 445 | +| Label = [[Blackground Records|Blackground]] |
| 446 | +| Associated_acts = [[Missy Elliott]], [[Timbaland]], [[Steve "Static" Garrett|Static]], [[R. Kelly]], [[Ginuwine]], [[Tweet (singer)|Tweet]] |
| 447 | +| URL = [http://www.aaliyah.com Aaliyah.com] |
| 448 | +}} |
| 449 | +'''Aaliyah Dana Haughton'''<ref name="Aaliyah NNDB Profile"/> ([[January 16]], [[1979]] – [[August 25]], [[2001]]), known professionally as '''Aaliyah''', was a [[Grammy Award]]-winning [[United States|American]] [[singer]], [[dancer]], [[Model (person)|model]] and [[actress]]. Introduced to audiences by R&B singer [[R. Kelly]], Aaliyah became famous during the mid-1990s with several hit records from the songwriting/production team of [[Missy Elliott]] & [[Timbaland]] and their associate [[Steve "Static" Garrett]]. Aaliyah soon joined Timbaland's R&B and hip hop collective, the [[Superfriends Clique]].
| 450 | + |
| 451 | +She was notable for recording several hit records, including several number one R&B hits, a number one pop hit, and nine top 10 singles on the [[Billboard Hot 100]]. She also modeled for [[Tommy Hilfiger]] and starred in two [[motion pictures]] before dying in a plane crash in [[the Bahamas]].
| 452 | + |
| 453 | +==Early years== |
| 454 | +Aaliyah Dana Haughton was born in Bedford-Stuyvesant, Brooklyn, New York, on January 16, 1979, to Michael and Diane Haughton, and was raised in Detroit, Michigan. Her name means "Highest, Most Exalted, The Best" in [[Arabic language|Arabic]] or "to ascend" in [[Hebrew]]. Aaliyah was brought up as a [[Catholic]] with her older brother [[Rashad Haughton]]. Her grandmother, Mintis L. Hicks Hankerson, was of [[African American]] and [[Native Americans in the United States|Native American]] descent. Diane Haughton, Aaliyah's mother, also a vocalist, encouraged her daughter's career. Her uncle, [[Barry Hankerson]], is a prominent figure in the music industry, and Aaliyah's aunt through marriage to Hankerson is [[Gladys Knight]], a legendary soul singer with [[Gladys Knight & the Pips]].
| 455 | + |
| 456 | +She appeared on the TV talent show ''Star Search'' at age ten, singing her mother's favorite song, "My Funny Valentine". Although she did not win, Aaliyah worked with an agent in New York and began to attend auditions for TV shows, including ''[[Family Matters (TV series)|Family Matters]]''. |
| 457 | + |
| 458 | +Following her appearance on ''Star Search'', Aaliyah performed on stage in [[Las Vegas]] with Gladys Knight. In her early teens, Aaliyah attended the Detroit High School for the Fine and Performing Arts, and graduated as a dance major with a 4.0 GPA.
| 459 | + |
| 460 | +==''Age Ain't Nothing But a Number'' (1994)== |
| 461 | +[[Image:Aaliyah-age-aint-94.jpg|right|200px|thumb|Cover of ''[[Age Ain't Nothing but a Number]]''.]] |
| 462 | +Aaliyah signed with her uncle [[Barry Hankerson]]'s [[Blackground Records]] label in 1993 at the age of 14. She released her debut album, titled ''[[Age Ain't Nothing but a Number]]'', in 1994 at the age of 15. [[R. Kelly]], Aaliyah's then alleged husband, was a leading songwriter and producer on her debut album. The album displayed her smooth and velvety vocals, and the production work was said to be original and innovative. The album went [[platinum album|platinum]] within months. The album featured the gold-selling singles "[[Back and Forth (Aaliyah song)|Back and Forth]]" (#1 U.S. R&B, 3 weeks, and #5 [[Hot 100]]), "[[At Your Best]]" (#2 U.S. R&B and #6 [[Hot 100]]), a cover of [[The Isley Brothers]]' 1976 song, the title track "[[Age Ain't Nothing But A Number (song)|Age Ain't Nothing But A Number]]" (#75 [[Hot 100]] and #35 US R&B, 2 weeks), and "Down with the Clique" (#33 UK Top 75 Singles). "Back and Forth" was sampled by [[Madonna (entertainer)|Madonna]] for the track "[[Inside of Me]]", which appears on her 1994 album ''[[Bedtime Stories]]''. In June 1995, Aaliyah released another single to radio only, "No One Knows How to Love Me Quite Like You Do." The album has sold over 3.5 million copies in the U.S. to date and nearly 7 million worldwide.
| 463 | + |
| 464 | +In 1994, a rumour surfaced that 15-year-old Aaliyah and 27-year-old R. Kelly had secretly married in the state of [[Illinois]]. Both initially denied the rumour. Although many websites and television shows claimed to have found a marriage certificate, it has never been proven that R. Kelly and Aaliyah were married.
| 465 | + |
| 466 | +==Guest appearances, movie roles and soundtracks (1995 - 2001)== |
| 467 | +[[Image:romeo must die dvd.jpg|150px|left|thumb|Aaliyah on the cover of her first film ''[[Romeo Must Die]]'' alongside [[Jet Li]] {{speedy-image-c}}]]
| 468 | +In 1995, at age 16, Aaliyah performed "[[The Star-Spangled Banner]]" live at an [[Orlando Magic]] basketball game. Also during that year, she appeared on the soundtrack for ''[[A Low Down Dirty Shame]]'' with the minor international hit "The Thing I Like" (#33 UK). The song was also included on international versions of ''[[Age Ain't Nothing But A Number]]''.
| 469 | + |
| 470 | +In 1997, Aaliyah appeared on the soundtrack album for the [[Fox Animation Studios]] animated feature ''[[Anastasia (1997 movie)|Anastasia]]'', singing the pop version of "[[Journey to the Past]]". The song was nominated for an [[Academy Awards|Academy Award]], and Aaliyah performed it at the 1998 Academy Awards ceremony. Not only was she the youngest female recording artist to perform at the ceremony, but she was also the youngest African American to be nominated for [[Academy Award for Best Original Song|Best Original Song]].
| 471 | + |
| 472 | +Aaliyah had a hit in 1998 with "[[Are You that Somebody]]" (number one U.S. airplay, eight weeks), the main single from the ''[[Dr. Dolittle (film)|Dr. Dolittle]]'' soundtrack. Its video was the third most-played on [[MTV]] that year, and the song's success helped make Aaliyah a household name (and earned her the title Queen of Urban Pop).
| 473 | + |
| 474 | +In 2000, she co-starred with [[Jet Li]] in the [[martial arts]] film ''[[Romeo Must Die]]'', which debuted at number one at the box office. Aaliyah and Timbaland executive produced the film's soundtrack album and Aaliyah contributed four songs: "Are You Feelin' Me?," "I Don't Wanna," "Come Back in One Piece," a duet with [[DMX (rapper)|DMX]], and the international number one hit "[[Try Again]]." Aaliyah made history once more when "Try Again" became the first song to ever reach number one on the Billboard Hot 100 based solely on the strength of its radio airplay, without any single sales factored in. After the huge success of "Try Again" at radio, a [[12 inch single|12" maxi single]] was released for consumer purchase. The radio-only single, "I Don't Wanna", (which was also featured on the soundtracks for the films ''[[Next Friday]]'' and ''Romeo Must Die'') peaked at number five on the Billboard Hot R&B/Hip Hop Singles & Tracks chart. |
| 475 | + |
| 476 | +In 2001, Aaliyah went to [[Australia]] to co-star with [[Stuart Townsend]] in the film ''[[Queen of the Damned (film)|Queen of the Damned]]'', an adaptation of the [[Anne Rice]] novel of the same name. Aaliyah also recorded most of her third studio album, ''[[Aaliyah (album)|Aaliyah]]'', during this time. |
| 477 | + |
| 478 | +==''One in a Million'' (1996)== |
| 479 | +[[Image:aaliyah-one-in-a-million.jpg|200px|right|thumb|Cover of ''[[One in a Million (album)|One In A Million]]''.]] |
| 480 | +''[[One in a Million (album)|One In A Million]]'', Aaliyah's sophomore album, was chiefly written and produced by then-unknowns [[Missy Elliott]] and [[Timbaland|Timothy "Timbaland" Mosley]] and released on [[August 27]], [[1996]], when she was 17 years old. The album was a landmark in Aaliyah's career, garnering her mass critical acclaim and introducing Aaliyah's more mature side. It showcased the newfound chemistry between Aaliyah and Timbaland. The album was certified double-platinum within a year, making Aaliyah a major R&B star and igniting the successful careers of Missy Elliott and Timbaland. ''One in a Million'' featured the international smash hit "[[If Your Girl Only Knew]]" (number one U.S. R&B, 2 weeks, and #11 Hot 100), "[[One in a Million (Aaliyah song)|One In A Million]]" (#1 U.S. R&B airplay, six weeks, and #25 US Hot 100 Airplay), the #8 U.S. R&B and #9 [[Hot 100]] single "[[The One I Gave My Heart To]]," a ballad written by [[Diane Warren]], "[[4 Page Letter]]" (#12 R&B Airplay), "[[Hot Like Fire (Timbaland Remix)|Hot Like Fire]]" (two versions) (#31 R&B Airplay), and "[[Got to Give It Up (Aaliyah song)|Got To Give It Up]]" (#37 UK), a remake of the [[1977]] [[Marvin Gaye]] song.
| 481 | + |
| 482 | +[[Tommy Hilfiger]] gave Aaliyah her first endorsement deal, signing her for print campaigns, runway shows, and a commercial. During this period, Aaliyah also made guest appearances on albums by artists such as [[Missy Elliott]], [[Timbaland & Magoo]], [[Ginuwine]] and [[Playa (band)|Playa]]. [[Timbaland]] and [[Playa]]'s frontman [[Steve "Static" Garrett]] would remain Aaliyah's principal collaborators for the duration of her career. To date, ''One in a Million'' has sold over 3.7 million copies in the U.S. and over 11 million worldwide.
| 483 | + |
| 484 | +After the success of ''One in a Million'', Aaliyah headlined her own tour in 1997, the Hot Like Fire Tour, playing various major city venues and performing hits from her albums.
| 485 | + |
| 486 | +==''Aaliyah'' (2001)== |
| 487 | +<!--[[Image:Aaliyah-ep-2001.jpg|200px|left|thumb|Cover of [[Aaliyah (album)|Aaliyah]] commenting out image with no source/bad FairUse claim--> |
| 488 | +"[[We Need a Resolution (Aaliyah song)|We Need a Resolution]]," the first single from Aaliyah's third studio album, was released [[April 24]], 2001 (see [[2001 in music]]). The self-titled ''[[Aaliyah (album)|Aaliyah]]'' was released three months later on [[July 17]], [[2001]]. The album was an instant critical success but sales were initially lower than expected, although they increased considerably after her death. ''Aaliyah'' introduced a darker and edgier side to Aaliyah's music and was noted as having showcased her growth as an artist. Around the time of the album's recording and release she had been filming ''Queen of the Damned'', which helped her show a dark and edgy side as her character was a deadly villain. The album debuted at number two on the [[Billboard 200]] chart, selling 190,000 copies in its first week, and was certified gold (500,000 copies sold) within four weeks, before her death. The week after the plane crash it climbed to number one. [[Trent Reznor]] of [[Nine Inch Nails]] was to produce a song on the album but scheduling conflicts did not permit the collaboration. |
| 489 | + |
| 490 | +==="More Than A Woman" and "Rock The Boat"=== |
| 491 | +There was no shortage of confusion at the label regarding the next single from the ''Aaliyah'' album. Aaliyah had been promoting "[[More Than a Woman (Aaliyah song)|More Than a Woman]]", having performed it twice and shot a video with director Dave Meyers in the summer of 2001. According to Blackground, a remix featuring State Property and Jay-Z was also planned, but was scrapped due to lack of adequate funds. The video was to be released, but "[[Rock the Boat (Aaliyah song)|Rock the Boat]]" began receiving huge amounts of radio play, so she was immediately sent to the Bahamas to shoot a video for that song instead. The "[[Rock The Boat]]" music video was put in the 106 & Park hall of fame, making the countdown over 65 times, and landed at #2 on [[BET]]'s Top 100 videos of 2001. "[[More Than a Woman]]" took the number-one spot after "Rock the Boat" was retired. "Rock the Boat" was a #2 U.S. R&B and #14 Hot 100 single. "I Care 4 U" was a #3 U.S. R&B and #16 Hot 100 single. The album went on to sell over 8 million copies worldwide.
| 492 | + |
| 493 | +==Death== |
| 494 | +On August 25, 2001, at 6:49 pm, just after wrapping up filming of the "Rock the Boat" video, Aaliyah and various members of her record company boarded a twin-engine [[Cessna 402]]B (N8097W) at Marsh Harbour, Abaco Island, Bahamas, to travel to Opa-locka Airport near Miami, Florida, but the plane crashed shortly after takeoff about 200 feet from the runway. Pilot Luis Morales III and all eight passengers, including Aaliyah, were killed in the crash. According to findings from an inquest conducted by the coroner's office in the Bahamas, Aaliyah suffered "severe burns and a blow to the head," in addition to severe shock. The coroner theorized that, even if Aaliyah had survived the crash, her recovery would have been virtually impossible given the severity of her injuries.<ref>[http://www.caribbeannetnews.com/2003/11/21/aaliyah.htm Bahamas Coroner delivers verdict in Aaliyah death crash.] (2003, November 21). ''Caribbean News''. Retrieved February 9, 2007.</ref>
| 495 | + |
| 496 | +Aaliyah's funeral was held on August 31 at Saint Ignatius Loyola Roman Catholic Church on East 84th Street in Manhattan. A horse-drawn carriage then carried her coffin to [[Ferncliff Cemetery]] in Hartsdale, New York, where she was initially interred in a crypt in the extension wing of the main mausoleum. When the Rosewood Mausoleum was completed a couple of years later, Aaliyah was moved to a private room in the new building. Her alias ''Baby Girl'' is engraved on her crypt.
| 497 | + |
| 498 | +===Investigation=== |
| 499 | +{{Copypaste}} |
| 500 | + |
| 501 | +NTSB reports indicate that the pilot, Luis Morales III, was not qualified to pilot the plane he was attempting to fly. Morales falsely obtained his FAA license by claiming hundreds of flight hours he had never flown, and he may also have falsified how many hours he had flown in order to get a job with his employer, Blackhawk International Airways. Additionally, an autopsy performed on Morales revealed cocaine and alcohol in his blood.
| 502 | + |
| 503 | +Further investigations determined the plane was over its total gross weight by several hundred pounds. Although witnesses claimed that the passengers had been asked to leave some luggage behind, it was later discovered that the passengers, including Aaliyah, had not been informed of the excess weight. |
| 504 | + |
| 505 | +Eddie Golson, president of Pro Freight Cargo Services at Opa-locka Airport, said workers carted "a pickup truck of freight" from the crash site Monday. "That's absurd to think that this pilot got in this airplane with eight other people and a truck full of freight and expected this thing to fly," Golson said. "What the hell was going on?" a baggage handler was reported to have said, in reaction to hearing that no one weighed the passengers or baggage. Two of the passengers, members of Aaliyah's entourage, weighed in the neighborhood of 300 pounds and sat in the rear of the plane, where the baggage was also stored. |
| 506 | + |
| 507 | +The day of the aviation incident was Mr. Morales' first official day with Blackhawk International Airways, a Part 135 single-pilot operation. He had been employed with Golden Airlines, from which he was fired only four hours before the fatal aviation incident. In addition, Luis Morales III was not registered with the FAA to fly for Blackhawk. As a result of the incident, a wrongful death lawsuit was filed by Aaliyah’s parents and was later settled out of court for an undisclosed amount. |
| 508 | + |
| 509 | +Barry & Sons, Inc., a corporation formed in 1992 to develop, promote and capitalize on the musical talents of Aaliyah and to oversee the production and distribution of her records, tapes and music videos, brought an unsuccessful lawsuit in the Supreme Court of the State of New York against Instinct Productions LLC, a company hired by Barry & Sons, Inc. in August 2001 to produce the "Rock the Boat" music video. The case was dismissed since New York State's wrongful death statute permits only certain people to recover damages for wrongful death.<ref>[http://www.courts.state.ny.us/reporter/3dseries/2005/2005_00096.htm Text of appellate division decision dismissing the case.]</ref>
| 510 | + |
| 511 | +==Legacy== |
| 512 | +"Rock the Boat" went on to become a [[Posthumous work|posthumous]] hit on radio (reaching number two on Billboard's Hot R&B Singles charts, number 14 on the Hot 100, and number 12 in the UK) and video channels, and the news of Aaliyah's death gave her album a notable sales boost, pushing it to number one on the Billboard 200. The album produced two other singles. "More than A Woman" reached number 7 on Billboard's Hot R&B singles chart, number 25 on Hot 100, and number one in the UK. "I Care 4 U" reached number three on Billboard's Hot R&B singles chart and number 16 on the Hot 100, the latter attaining success even without the promotional push of a [[music video]]. The ''Aaliyah'' album went on to sell over 3 million copies in the U.S. ''[[Queen of the Damned (film)|Queen of the Damned]]'' was released in early 2002. Before its release, Aaliyah's brother Rashad was called upon to re-dub several of his sister's lines during the post-production [[Dubbing (music)|ADR]] process. Upon its release, the film debuted at number one. The film was also dedicated to her. |
| 513 | + |
| 514 | +In 2001, Missy Elliott released her video for "[[Miss E... So Addictive|Take Away]]". The video contained words and images about Aaliyah. The single also featured Ginuwine and was the debut of Elliott's recent protégé, [[Tweet (singer)|Tweet]]. |
| 515 | + |
| 516 | +Aaliyah and former [[The Beatles|Beatle]] [[George Harrison]] made UK chart history in January 2002 when they scored the first, and to date only, back-to-back posthumous number one hits (aside from the [[Elvis Presley]] re-releases in 2005). Aaliyah's "More than a Woman", released on January 7, topped the chart on January 13; it was followed by Harrison's "My Sweet Lord", re-released on January 14, which topped the chart on January 20. Aaliyah was voted one of "The Top 40 Women of the Video Era" in VH1's The Greatest, and also ranked #36 on their list of the 100 Sexiest Artists. Aaliyah also made E!'s lists of the 101 Most Shocking Moments in Entertainment, Juiciest Hollywood Hookups, and Best Kept Hollywood Secrets. Aaliyah recently ranked at #18 on BET's "Top 25 Dancers of All Time". In 2005, former co-star [[Jet Li]], as reported by CNN, stated that the memory of Aaliyah had haunted him in Vancouver, where he and the late songstress shot the film ''[[Romeo Must Die]]''.
| 517 | + |
| 518 | +In addition, Aaliyah has been the topic of four books: ''Aaliyah: More than a Woman'' (2001) by Christopher John Farley, ''Aaliyah: An R&B Princess in Words and Pictures'' (2002) by Kelly Kenyatta, ''Aaliyah'' (2003) by [[Tim Footman]], and ''Aaliyah Remembered: Her Life & The Person behind the Mystique'' (2005) by William Sutherland.
| 519 | + |
| 520 | +"Her legacy is, you can achieve your dreams and still maintain being a beautiful person" |
| 521 | +-Brother [[Rashad Haughton]] |
| 522 | + |
| 523 | +===Lyrical Dedications=== |
| 524 | + |
| 525 | +* [[Boyz II Men]]: "Think Of Aaliyah" a.k.a. "The Aaliyah Song" - "''When you think of Aaliyah, laugh, don't cry, cause I know she want it that way''". |
| 526 | +* [[Brandy (entertainer)|Brandy]]: "Turn It Up" - ''"Get Baby Girl's attention, she's more than a woman and we sure do miss her. I wanna represent her, Timbaland, Missy, you get the picture."'',"Should I Go" - "''this industry was more like a different world, when it was just me, Monica, and Baby Girl [Aaliyah], I never got to tell you how much you meant / I wish you and me both was sittin' here workin with Tim / Just to be in the presence of people that you affected on a personal level just makes me stop for a second. [inhales and exhales] You were such a blessing, you helped me answer all of my questions."'' |
| 527 | +* [[D12]]: "9-11" - ''"We lost Aaliyah, lost our families, it takes no tenges. You don't need us to see the world is (messed) up, God can see it"'' |
| 528 | +* [[Foxy Brown]]: "Big Bad Mama" - ''"Rhyme deep in footwear, via Spiga/ Like Aaliyah, One in a Million/There's MC's in this rap shit comin in illin/ like I did, laid the groundwork for five hits/ Member when I told y'all first week out/ Shipped a half a mil, niggaz freaked out/ Love yourself, put no one above thee/ Cause ain't nobody gon' fuck me like me, it's on'"'' |
| 530 | +* [[Jadakiss]]: "Why" - ''"Why Aaliyah had to take that flight?"'' |
| 531 | +* [[Jay-Z]]: "Miss You Remix" - names certain people who missed her after her death. |
| 532 | +* [[Juelz Santana]]: "One Day I Smile" - ''"Once again a deep thought of Aaliyah crosses my mind"'' |
| 533 | +* [[Kanye West]]: "Never Let Me Down" - ''"But I can't complain what the accident did to my Left Eye / Cuz look what an accident did to Left Eye / First Aaliyah and now Romeo must die / I know I got angels watching me from the other side"'' |
| 534 | +* [[Layzie Bone]]: "For The Thugs That's Gone" - "Too many celebrities perish, these people we love and cherish, and I had a chance to meet Aaliyah, but I was to embarrassed and I should of took a chance, I heard that from a man, Jam Master J was so real, you niggas don't understand, he told me to handle my business, make sure I pay my taxes, a little advice from a legend to keep my paper stacking, and I gotta give props to Eazy, that nigga put me on, if he didn't believe in the thugs you all wouldn't of heard of Bone."
| 535 | +* [[Lil' Flip]]: "Hall of Fame Graveyard" - ''"From Eazy E to Aaliyah, we even lost Left Eye / How come the wack rappers live but the best die"'' |
| 536 | +* [[Mary J. Blige]] - "[[MJB Da MVP]]" - ''"It was when Aaliyah died / I could hardly sleep / Thought about it every day / and it made Me change my ways"'' |
| 537 | +* [[Missy Elliott]]: "Can You Hear Me?" - ''"I been checkin' on your moms and dad / And your brother since the day you left / Passed on and went away with God / But for your mama it's been / So damn hard / I hate to even hear her cry / Aaliyah she asked me why / Would her baby girl go this way / Can you give me better words to say / Cause One day she'll see you again / With the same old beautiful smile / Long hair and the voice of a hummingbird / You'll be singing them same old songs / Aaliyah can you hear me? / I hope that you're proud of me / Me and Tim we been doing our thing / But it's never been the same / Since you had to go / I ain't never met a friend / More incredible"'' |
| 538 | +* [[Monica (singer)|Monica]]: "Go To Bed Mad" - ''"Argue about things so critical / And you heated over nothing / And just hang up the phone / I want / I wanna talk in the mood / See we need a resolution / Like that Aaliyah song"'' |
| 539 | +* [[Mya]]: "After The Rain" - ''"No one could ever fill your shoes, you're one in a million"'' |
| 540 | +* [[Nas]] featuring [[Quan (rapper)|Quan]]: "Just A Moment" - ''"And can we please have a moment to mourn? / For Pac, Biggie and Pun 'cause through us they live on / Jam Master Jay, Freaky Ty and Aaliyah / Big L and Left Eye, when we die we hope to see ya"'' |
| 541 | +* [[Ray J]]: "War Is Over" - ''"One day one day one day / I hope to see my girl Aaliyah"'' |
| 542 | +* [[The Game (rapper)|The Game]]: "Dreams" - ''"Martin Luther King had a dream, Aaliyah had a dream, Left Eye had a dream"'', "Runnin" - ''"God let me in, give me a room by Aaliyah with ESPN"'' |
| 543 | +* [[T.I.]]: "Rubberband Man" - ''"throw your lighters up for my cousin Toot, Aaliyah, Left Eye, and Jam Master Jay"'' |
| 544 | +* [[TQ]] : "Gone But Not Forgotten" - ''"Aaliyah, I wish we could've did a song, but baby girl when I get my wings, I'm gonna send your precious love"'' |
| 545 | +* [[Wyclef Jean]]: "Industry" - ''"Back and forth and forth and back / Like Miss Aaliyah man do I miss her"'' |
| 546 | +* [[Outkast]] ft. [[Killer Mike]]: "The Whole World" - ''"Mami, I'm coming, I hope u get off / Or rock your own boat like Aaliyah don taught / Back, back and forth, forth / Get that sailor on course course"'' |
| 547 | +* French R&B singer [[Assia]] covered "Don't Know What to Tell Ya" with French and Arabic lyrics, entitling it "Le prix pour t'aimer (Habibi Maareft Ach'n Oullek)", on her latest album ''Encore et Encore''.
| 548 | +* [[The Gossip]] covered "Are You That Somebody?" as a tribute to Aaliyah.
| 549 | + |
| 550 | +Others include tracks by ''[[DMX]], [[Yolanda Adams]], [[Tyrese]], [[R. Kelly]], [[TLC (band)|TLC]], [[Timbaland]] & Outsiderz 4 Life''.{{Fact|date=March 2007}} |
| 551 | + |
| 552 | +* [[Cooper C.]]: "Why...?" - "Imma rock da boat, Aaliyah, and be wit you. One day, hopefully I will see you too." |
| 553 | + |
| 554 | +===Unfinished Films=== |
| 555 | +Aaliyah was to have had a supporting role as Zee, the wife of [[Harold Perrineau Jr.]]'s character, Link, in the two sequels to ''[[The Matrix]]''. The directors initially tried to find a way to incorporate her footage into the movies but decided against it due to the lack of available material. The role was recast with [[Nona Gaye]] playing the character. Other films in which Aaliyah was signed to star were ''[[Honey (2003 film)|Honey]]'' (which instead was filmed with [[Jessica Alba]] as the main character) and a [[Whitney Houston]]-produced remake of the 1976 film ''[[Sparkle (1976 film)|Sparkle]]'' (now in the works with [[Raven-Symoné]] as the main character).
| 556 | + |
| 557 | +In addition, Aaliyah and one of her agents had pitched and signed a deal with Fox Searchlight Pictures for her to star in a film based upon a true story about interracial love. |
| 558 | + |
| 559 | +==''I Care 4 U'' (2002)== |
| 560 | +[[Image:aaliyah icare4u.jpg|right|thumb|200 px|Cover of ''[[I Care 4 U]]''.]] |
| 561 | +''I Care 4 U'' was released by Blackground Records on December 10, 2002. Along with her hit singles, a number of previously unreleased tracks were included on the album, including "[[Erica Kane]]", "Don't Worry" and "All I Need", as well as the new singles "Miss You", "Don't Know What to Tell Ya", and "Come Over". ''I Care 4 U'' debuted at an impressive #3 on the Billboard 200 and #1 on the R&B album chart (where it remained for 7 weeks). The album went on to sell over 2.6 million copies in the U.S. and 5 million worldwide.
| 562 | + |
| 563 | +The video for "Miss You" features [[Missy Elliott]], [[Toni Braxton]], [[Lil' Kim]], [[Dallas Austin]], MTV presenter and close friend [[Ananda Lewis]], actor/singer [[Jamie Foxx]], [[AJ Calloway]], [[Free (rapper)|Free]], [[Quddus (MTV)|Quddus]], Missy's recent protegé and longtime friend [[Tweet]], [[U-God]] (of the ''[[Wu-Tang Clan]]'') and [[DMX (rapper)|DMX]], [[Rosario Dawson]], among others, paying tribute to Aaliyah. Following her death, her single "[[Miss You (2003 song)|Miss You]]" made it to #1 on the [[Billboard Hot 100|US R&B Charts]]. The album earned Aaliyah a [[posthumous]] [[Grammy]] for [[Best Instrumental Arrangment Accompanying Vocals]] |
| 564 | + |
| 565 | +The follow-up single to "Miss You" was "[[Don't Know What to Tell Ya]]". However, it was released only in Europe, peaking at #22 in the UK and #57 in Germany. The "Handcuff Remix" became popular among fans who had bought the single. The third and final single released (second in the U.S.) was "[[Come Over (Aaliyah song)|Come Over]]". The single had moderate pop success, peaking in the top 40 of the Hot 100 at #32. It did much better on the R&B chart, becoming a top-10 hit and peaking at #9.
| 566 | + |
| 567 | +Shortly after, the "Greatest Hits: Special Fan Box" [http://www.amazon.co.uk/exec/obidos/ASIN/B0001GYH2A/ref=ord_cart_shr/202-2194674-4915813] was released. It featured re-packaged versions of the albums ''One In A Million'', ''Aaliyah'' and ''I Care 4 U'', together with a DVD containing all of Aaliyah's music videos, all packaged in a special box.
| 568 | + |
| 569 | +==Aaliyah in the mid-2000s== |
| 570 | +In early-to-mid 2005, four previously unreleased Aaliyah tracks were leaked to the Internet: a cover of [[Gladys Knight & the Pips|Gladys Knight & the Pips']] "Giving Up", "Where Could He Be" featuring Missy Elliott and Tweet (which was sent to radio stations), "Steady Ground" featuring Static from Playa, and a duet with Digital Black from Playa entitled "Don't Think They Know". In January 2006, a new unreleased Aaliyah track was leaked to the Internet. Entitled "Time", it was a snippet of an unfinished song produced by Timbaland (a sample of this track can be found on YouTube). Buzz about a song titled "Girlfriends" had been brewing for years since Aaliyah's death, until [[Yaushameen Michael]] recently posted the song, a duet with the late R&B princess, on her Myspace. There are also many other rumored unreleased tracks, such as "Did You Hear", "Don't Think They Know" feat. Digital Black, "Forever in My Heart", and "Candy".
| 571 | + |
| 572 | +==Merchandise and the Aaliyah Charity Fund== |
| 573 | +Aaliyah's official website features items such as t-shirts with Aaliyah's name on them. She has had a calendar with her pictures since 2002. In 2007, Aaliyah's mother Diane Haughton and former manager Paul Allcata hired branding and licensing agency Wicked Cow Entertainment to grow the Aaliyah licensing program. Plans are currently underway for an apparel and accessories line.<ref>{{cite|The Licensing Letter|epmcom.com|title=Properties Available for Licensing|publisher=EPM|author=The Licensing Letter|date=2007-04-12}}</ref> |
| 574 | + |
| 575 | +==Discography== |
| 576 | +{{further|[[Aaliyah discography]]}} |
| 577 | + |
| 578 | +===Albums=== |
| 579 | +{| class="wikitable" |
| 580 | +! width=100| Year |
| 581 | +! width=200| Album Title |
| 582 | +! width=100| U.S. sales
| 583 | +! width=100| Worldwide sales
| 584 | +! width=100| U.S. chart peak
| 585 | +|- align="center" |
| 586 | +| 1994 || ''[[Age Ain't Nothing but a Number]]'' || 3.5 million || 7 million || 18 |
| 587 | +|- align="center" |
| 588 | +| 1996 || ''[[One in a Million (album)|One in a Million]]'' || 3.7 million || 11 million {{fact}} || 18 |
| 589 | +|- align="center" |
| 590 | +| 2001 || ''[[Aaliyah (album)|Aaliyah]]'' || 2.7 million || 8 million || 1 |
| 591 | +|- align="center" |
| 592 | +| 2002 || ''[[I Care 4 U]]'' || 1.5 million || 6 million || 3 |
| 593 | +|- align="center" |
| 594 | +| 2005 || ''[[Ultimate Aaliyah]]'' || 0.2 million || 2 million{{fact}} || - |
| 595 | +|- |
| 596 | +! colspan=2 | Total || 11.6 million || 34 million || |
| 597 | +|} |
| 598 | + |
| 599 | +=== Number-one singles === |
| 600 | +<!-- If it doesn't enter the singles chart, airplay charts are allowed to be putted. --> |
| 601 | + |
| 602 | +{{dablink|The following singles reached number one in the [[United States|U.S.]], on the [[Hot R&B/Hip-Hop Songs|U.S. R&B]] chart, in the [[United Kingdom]] or in [[New Zealand]]. Their peak positions on the [[United World Chart]] are also included.}}
| 603 | +{| class="wikitable" |
| 604 | +|- bgcolor="#CCCCCC" |
| 605 | +!align="center" rowspan="2" | Year |
| 606 | +!align="center" rowspan="2" | Single |
| 607 | +!align="center" colspan="8" | Peak positions |
| 608 | +|- bgcolor="#FFFFFF" |
| 609 | +! width="60"|<small>US</small> |
| 610 | +! width="60"|<small>US R&B</small> |
| 611 | +! width="60"|<small>UK</small> |
| 612 | +! width="60"|<small>NZ</small> |
| 613 | +! width="60"|<small>United World Chart</small> |
| 614 | +|- |
| 615 | +|align="center" rowspan="1"|1994 |
| 616 | +|align="left"|"[[Back and Forth (song)|Back and Forth]]" |
| 617 | +|align="center"|5 |
| 618 | +|align="center"|'''1''' |
| 619 | +|align="center"|16 |
| 620 | +|align="center"|18 |
| 621 | +|align="center"|- |
| 622 | +|- |
| 623 | +|align="center" rowspan="2"|1996 |
| 624 | +|align="left"|"[[If Your Girl Only Knew]]" |
| 625 | +|align="center"|11 |
| 626 | +|align="center"|'''1''' |
| 627 | +|align="center"|15 |
| 628 | +|align="center"|- |
| 629 | +|align="center"|- |
| 630 | +|- |
| 631 | +|align="left"|"[[One in a Million (Aaliyah song)|One in a Million]]" |
| 632 | +|align="center"|25 |
| 633 | +|align="center"|'''1'''<sup>1<sup> |
| 634 | +|align="center"|15 |
| 635 | +|align="center"|- |
| 636 | +|align="center"|- |
| 637 | +|- |
| 638 | +|align="center" rowspan="1"|1998 |
| 639 | +|align="left"|"[[Are You That Somebody?]]" |
| 640 | +|align="center"|10 |
| 641 | +|align="center"|'''1'''<sup>1<sup> |
| 642 | +|align="center"|11 |
| 643 | +|align="center"|'''1''' |
| 644 | +|align="center"|- |
| 645 | +|- |
| 646 | +|align="center" rowspan="1"|2000 |
| 647 | +|align="left"|"[[Try Again]]" |
| 648 | +|align="center"|'''1''' |
| 649 | +|align="center"|4 |
| 650 | +|align="center"|5 |
| 651 | +|align="center"|13 |
| 652 | +|align="center"|4 |
| 653 | +|- |
| 654 | +|align="center" rowspan="1"|2002 |
| 655 | +|align="left"|"[[More Than a Woman (Aaliyah song)|More Than a Woman]]" |
| 656 | +|align="center"|25 |
| 657 | +|align="center"|7 |
| 658 | +|align="center"|'''1''' |
| 659 | +|align="center"|- |
| 660 | +|align="center"|37 |
| 661 | +|- |
| 662 | +|align="center" rowspan="1"|2003 |
| 663 | +|align="left"|"[[Miss You (Aaliyah song)|Miss You]]" |
| 664 | +|align="center"|3 |
| 665 | +|align="center"|'''1''' |
| 666 | +|align="center"|7 |
| 667 | +|align="center"|- |
| 668 | +|align="center"|29 |
| 669 | +|- |
| 670 | +|align="center"| |
| 671 | +!align="center"|Total number-one singles |
| 672 | +|align="center"|'''1''' |
| 673 | +|align="center"|'''5''' |
| 674 | +|align="center"|'''1''' |
| 675 | +|align="center"|'''1''' |
| 676 | +|align="center"|- |
| 677 | +|} |
| 678 | + |
| 679 | +Notes:
| 680 | +<sup>1</sup> Topped the [[Hot R&B/Hip-Hop Airplay]] chart.
| 681 | + |
| 682 | +==Awards== |
| 683 | +This is a list of awards that Aaliyah won or was nominated for during her career.
| 684 | + |
| 685 | +===1995=== |
| 686 | +*1995 Nominated for an American Music Award for Favorite Soul/R&B New Artist |
| 687 | + |
| 688 | +*1995 Nominated for Best R&B Female Vocal Performance ("At Your Best")
| 689 | + |
| 690 | +* Nominated for two MTV VMAs: ''Best New Artist in Video'' and ''Best R&B Video'', both for "At Your Best"
| 691 | + |
| 692 | +* Nominated for three World Music Awards: ''World's Best Selling Female Artist'', ''World's Best Selling New Artist'' and ''World's Best Selling R&B Artist''
| 693 | + |
| 694 | +* Nominated for eight Billboard Music Awards: Best New R&B Artist, Best New Artist, Female Artist of the Year, Best R&B/Hip Hop Single, Best Female R&B Single, Best Female Pop Single, Best Hip Hop/R&B Artist, Best R&B/Hip Hop Album
| 695 | + |
| 696 | +* '''Won''' two Source Awards: Best New Artist and Best Female R&B Artist
| 697 | + |
| 698 | +===1999=== |
| 699 | +* 1999 Nominated for two MTV Video Music Awards: Best R&B Video and Best Video from a Film (both for "Are You That Somebody?")
| 700 | + |
| 701 | +* 1999 Nominated for an American Music Award for Favorite Soul/R&B Female Artist. |
| 702 | + |
| 703 | +* 1999 Nominated for an NAACP Image Award for Outstanding Music Video ("Are You That Somebody?") |
| 704 | + |
| 705 | +* 1999 Nominated for two Soul Train Lady of Soul Awards for Best R&B/Soul Song and Best R&B/Soul or Rap Music Video ("Are You That Somebody?"). |
| 706 | + |
| 707 | +*1999 Nominated for a Grammy Award for Best Female R&B Vocal Performance ("Are You That Somebody?")
| 708 | + |
| 709 | +*1999 Nominated for an [[Academy Award]] for Best Original Song ("Journey to the Past")
| 710 | + |
| 711 | +===2000=== |
| 712 | +*2000 Nominated for two Soul Train Lady of Soul Awards for Best R&B/Soul Single, Solo and Best R&B/Soul or Rap Music Video ("Try Again")
| 713 | +*2000 '''Won''' two MTV Video Music Awards: Best Female Video and Best Video from a Film ("Try Again")
| 714 | +*2000 Nominated for an MTV Europe Music Award for Best R&B Video
| 715 | +*2000 Nominated for two Radio Music Awards: Urban Song of the Year and Urban Artist of the Year
| 716 | +*2000 Nominated for a My VH1 Music Award in the Double Threat (Musicians-Actors) category
| 717 | + |
| 718 | +===2001=== |
| 719 | +*2001 Nominated for a Grammy Award for Best Female R&B Vocal Performance ("Try Again") |
| 720 | + |
| 721 | +===2002=== |
| 722 | +*2002 '''Won''' two American Music Awards: Favorite Soul/R&B Female Artist and Favorite Soul/R&B Album. |
| 723 | +*2002 Nominated for two Grammy Awards for Best Female R&B Vocal Performance ("Rock The Boat") and Best R&B Album ("Aaliyah") |
| 724 | +*2002 '''Won''' a Soul Train Award for R&B/Soul Single, Female ("Rock The Boat")
| 725 | +*2002 '''Won''' the Best R&B / Soul Single, Solo Award and R&B/Soul or Rap Song of the Year at the Soul Train Lady of Soul Awards (for "Rock The Boat") |
| 726 | +*2002 Nominated for an MTV Video Music Award for Best R&B Video ("Rock The Boat") |
| 727 | +===2003=== |
| 728 | +*2003 '''Won''' a Source Award for Best Female R&B Artist (''I Care 4 U'')
| 729 | + |
| 730 | +==Filmography== |
| 731 | +*''[[Romeo Must Die]]'' (2000) - Trish O'Day |
| 732 | +*''[[Queen of the Damned (film)|Queen of the Damned]]'' (2002) - Akasha |
| 733 | + |
| 734 | +===Unfinished films=== |
| 735 | + |
| 736 | +*''[[The Matrix Reloaded]]'' (2003) (replaced by [[Nona Gaye]]) |
| 737 | +*''[[The Matrix Revolutions]]'' (2003) (replaced by [[Nona Gaye]]) |
| 738 | +*''[[Honey (2003 film)|Honey]]'' (2003) (replaced by [[Jessica Alba]]) |
| 739 | +*''[[Sparkle]]'' (replaced by [[Raven-Symoné]]) |
| 740 | + |
| 741 | +Aaliyah was [http://www.notstarring.com/actors/aaliyah considered for parts in other films], including:
| 742 | + |
| 743 | +*''[[Charlie's Angels (film)|Charlie's Angels]]'' (2000) |
| 744 | +*''[[Get Over It (film)|Get Over It]]'' (2001) |
| 745 | +*''[[Josie and the Pussycats (film)|Josie and the Pussycats]]'' (2001) |
| 746 | +*''[[Osmosis Jones]]'' (2001) |
| 747 | + |
| 748 | +==See also== |
| 749 | +*[[Blackground Records]] |
| 750 | +*[[Missy Elliott]] |
| 751 | +*[[Steve "Static" Garrett]] |
| 752 | +*[[Rashad Haughton]] |
| 753 | +*[[Swing Mob]] |
| 754 | +*[[Timbaland]] |
| 755 | + |
| 756 | +==References== |
| 757 | +{{reflist|2}} |
| 758 | + |
| 759 | +==Links== |
| 760 | +*[http://www.Aaliyah.com Official site] |
| 761 | +*[http://www.aaliyahicare4u.com Artist Website] |
| 762 | +*[http://www.myspace.com/aaliyah Aaliyah's Official Myspace Page] |
| 763 | +*{{imdb name|id=0004691|name=Aaliyah}} |
| 764 | +*{{nndb name|id=742/000024670|name=Aaliyah}} |
| 765 | +*[http://www.billboard.com/bbcom/bio/index.jsp?pid=36610&cr=artist&or=ASCENDING&sf=length&kw=aaliyah Aaliyah bio on Billboard] |
| 766 | +*[http://www.hibblenradio.com/2001-08-CBS-AaliyahCrash.mp3 MP3 of CBS News, Radio reports on crash from Abaco Island] |
| 767 | + |
| 768 | +<!-- PLEASE DO NOT ADD FANSITES --> |
| 769 | + |
| 770 | +{{Aaliyah}} |
| 771 | + |
| 772 | +[[Category:Aaliyah| ]] |
| 773 | +[[Category:American actor-singers]] |
| 774 | +[[Category:American pop singers]] |
| 775 | +[[Category:American dance musicians]] |
| 776 | +[[Category:American Roman Catholics]] |
| 777 | +[[Category:American rhythm and blues singers]] |
| 778 | +[[Category:American female singers]] |
| 779 | +[[Category:American film actors]] |
| 780 | +[[Category:American dancers]] |
| 781 | +[[Category:African-American singers]] |
| 782 | +[[Category:African-American actors]] |
| 783 | +[[Category:Michigan musicians]] |
| 784 | +[[Category:Swing Mob artists]] |
| 785 | +[[Category:People from Brooklyn]] |
| 786 | +[[Category:People from Detroit]] |
| 787 | +[[Category:Plane crash victims]] |
| 788 | +[[Category:1979 births]] |
| 789 | +[[Category:2001 deaths]] |
| 790 | + |
| 791 | +[[cs:Aaliyah]] |
| 792 | +[[de:Aaliyah]] |
| 793 | +[[et:Aaliyah]] |
| 794 | +[[es:Aaliyah]] |
| 795 | +[[fa:عالیه]] |
| 796 | +[[fr:Aaliyah]] |
| 797 | +[[gl:Aaliyah]] |
| 798 | +[[it:Aaliyah]] |
| 799 | +[[lt:Aaliyah]] |
| 800 | +[[ms:Aaliyah]] |
| 801 | +[[nl:Aaliyah]] |
| 802 | +[[ja:アリーヤ]] |
| 803 | +[[no:Aaliyah]] |
| 804 | +[[pl:Aaliyah]] |
| 805 | +[[pt:Aaliyah]] |
| 806 | +[[ru:Аалия]] |
| 807 | +[[simple:Aaliyah]] |
| 808 | +[[fi:Aaliyah]] |
| 809 | +[[sv:Aaliyah]] |
| 810 | +[[tl:Aaliyah]] |
Index: branches/lucene-search-2.1/src/org/apache/lucene/search/CustomBoostQuery.java |
@@ -0,0 +1,351 @@
| 2 | +package org.apache.lucene.search; |
| 3 | +
| 6 | +/** |
| 7 | + * Licensed to the Apache Software Foundation (ASF) under one or more |
| 8 | + * contributor license agreements. See the NOTICE file distributed with |
| 9 | + * this work for additional information regarding copyright ownership. |
| 10 | + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| 11 | + * (the "License"); you may not use this file except in compliance with |
| 12 | + * the License. You may obtain a copy of the License at |
| 13 | + * |
| 14 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 15 | + * |
| 16 | + * Unless required by applicable law or agreed to in writing, software |
| 17 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 18 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 19 | + * See the License for the specific language governing permissions and |
| 20 | + * limitations under the License. |
| 21 | + */ |
| 22 | + |
| 23 | +import java.io.IOException; |
| 24 | +import java.util.Set; |
| 25 | + |
| 26 | +import org.apache.lucene.index.IndexReader; |
| 27 | +import org.apache.lucene.search.ComplexExplanation; |
| 28 | +import org.apache.lucene.search.Explanation; |
| 29 | +import org.apache.lucene.search.Query; |
| 30 | +import org.apache.lucene.search.Scorer; |
| 31 | +import org.apache.lucene.search.Searcher; |
| 32 | +import org.apache.lucene.search.Similarity; |
| 33 | +import org.apache.lucene.search.Weight; |
| 34 | +import org.apache.lucene.util.ToStringUtils; |
| 35 | + |
| 36 | +/** |
| 37 | + * Query that sets document score as a programmatic function of (up to) two (sub) scores. |
| 38 | + * <ol> |
| 39 | + * <li>the score of its subQuery (any query)</li> |
| 40 | + * <li>(optional) the score of its boosting Query, |
| 41 | + * for the most simple/convenient use case this query would be a
| 42 | + * {@link org.apache.lucene.search.function.FieldScoreQuery FieldScoreQuery}</li> |
| 43 | + * </ol> |
| 44 | + * Subclasses can modify the computation by overriding {@link #customScore(int, float, float)}. |
| 45 | + * |
| 46 | + * Note: documents will only match based on the first sub scorer. |
| 47 | + * |
| 48 | + * <p><font color="#FF0000"> |
| 49 | + * WARNING: The status of the <b>search.function</b> package is experimental. |
| 50 | + * The APIs introduced here might change in the future and will not be |
| 51 | + * supported anymore in such a case.</font> |
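| | + * <p>
| | + * A minimal usage sketch (the field and term names here are illustrative only,
| | + * not part of this patch):
| | + * <pre>
| | + *   // assumes an open Searcher named "searcher"
| | + *   Query main = new TermQuery(new Term("contents", "adams")); // any sub query
| | + *   Query rank = new TermQuery(new Term("rank", "high"));      // optional boosting query
| | + *   Hits hits = searcher.search(new CustomBoostQuery(main, rank));
| | + * </pre>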
| 52 | + */ |
| 53 | +public class CustomBoostQuery extends Query { |
| 54 | + |
| 55 | + private Query subQuery; |
| 56 | + private Query boostQuery; // optional, can be null |
| 57 | + private boolean strict = false; // if true, boosting part of query does not take part in weights normalization. |
| 58 | + |
| 59 | + /** |
| 60 | + * Create a CustomBoostQuery over input subQuery. |
| 61 | + * @param subQuery the sub query whose score is being customized. Must not be null.
| 62 | + */ |
| 63 | + public CustomBoostQuery(Query subQuery) { |
| 64 | + this(subQuery,null); |
| 65 | + } |
| 66 | + |
| 67 | + /** |
| 68 | + * Create a CustomBoostQuery over input subQuery and a {@link Query}. |
| 69 | + * @param subQuery the sub query whose score is being customized. Must not be null.
| 70 | + * @param boostQuery a value source query whose scores are used in the custom score
| 71 | + * computation. For the most simple/convenient use case this would be a
| 72 | + * {@link org.apache.lucene.search.function.FieldScoreQuery FieldScoreQuery}. |
| 73 | + * This parameter is optional - it can be null. |
| 74 | + */ |
| 75 | + public CustomBoostQuery(Query subQuery, Query boostQuery) { |
| 76 | + super(); |
| 77 | + this.subQuery = subQuery; |
| 78 | + this.boostQuery = boostQuery; |
| 79 | + if (subQuery == null) throw new IllegalArgumentException("<subQuery> must not be null!");
| 80 | + } |
| 81 | + |
| 82 | + /*(non-Javadoc) @see org.apache.lucene.search.Query#rewrite(org.apache.lucene.index.IndexReader) */ |
| 83 | + public Query rewrite(IndexReader reader) throws IOException { |
| 84 | + subQuery = subQuery.rewrite(reader); |
| 85 | + if (boostQuery!=null) { |
| 86 | + boostQuery = (Query) boostQuery.rewrite(reader); |
| 87 | + } |
| 88 | + return this; |
| 89 | + } |
| 90 | + |
| 91 | + /*(non-Javadoc) @see org.apache.lucene.search.Query#extractTerms(java.util.Set) */ |
| 92 | + public void extractTerms(Set terms) { |
| 93 | + subQuery.extractTerms(terms); |
| 94 | + if (boostQuery!=null) { |
| 95 | + boostQuery.extractTerms(terms); |
| 96 | + } |
| 97 | + } |
| 98 | + |
| 99 | + /*(non-Javadoc) @see org.apache.lucene.search.Query#clone() */ |
| 100 | + public Object clone() { |
| 101 | + CustomBoostQuery clone = (CustomBoostQuery)super.clone(); |
| 102 | + clone.subQuery = (Query) subQuery.clone(); |
| 103 | + if (boostQuery!=null) { |
| 104 | + clone.boostQuery = (Query) boostQuery.clone(); |
| 105 | + } |
| 106 | + return clone; |
| 107 | + } |
| 108 | + |
| 109 | + /* (non-Javadoc) @see org.apache.lucene.search.Query#toString(java.lang.String) */ |
| 110 | + public String toString(String field) { |
| 111 | + StringBuffer sb = new StringBuffer(name()).append("("); |
| 112 | + sb.append(subQuery.toString(field)); |
| 113 | + if (boostQuery!=null) { |
| 114 | + sb.append(", ").append(boostQuery.toString(field)); |
| 115 | + } |
| 116 | + sb.append(")"); |
| 117 | + sb.append(strict?" STRICT" : ""); |
| 118 | + return sb.toString() + ToStringUtils.boost(getBoost()); |
| 119 | + } |
| 120 | + |
| 121 | + /** Returns true if <code>o</code> is equal to this. */ |
| 122 | + public boolean equals(Object o) { |
| 123 | + if (getClass() != o.getClass()) { |
| 124 | + return false; |
| 125 | + } |
| 126 | + CustomBoostQuery other = (CustomBoostQuery)o; |
| 127 | + return this.getBoost() == other.getBoost() |
| 128 | + && this.subQuery.equals(other.subQuery) |
| 129 | + && (this.boostQuery==null ? other.boostQuery==null |
| 130 | + : this.boostQuery.equals(other.boostQuery)); |
| 131 | + } |
| 132 | + |
| 133 | + /** Returns a hash code value for this object. */ |
| 134 | + public int hashCode() { |
| 135 | + int boostHash = boostQuery==null ? 0 : boostQuery.hashCode(); |
| 136 | + return (getClass().hashCode() + subQuery.hashCode() + boostHash) ^ Float.floatToIntBits(getBoost()); |
| 137 | + } |
| 138 | + |
| 139 | + /** |
| 140 | + * Compute a custom score from the subQuery score and the boost query score.
| 141 | + * <p> |
| 142 | + * Subclasses can override this method to modify the custom score. |
| 143 | + * <p> |
| 144 | + * The default computation herein is: |
| 145 | + * <pre> |
| 146 | + * ModifiedScore = (0.2 + boostScore * 10) * subQueryScore.
| 147 | + * </pre> |
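| | + * For example, boostScore = 0.1 and subQueryScore = 2.0 give (0.2 + 1.0) * 2.0 = 2.4.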
| 148 | + * |
| 149 | + * @param doc id of scored doc. |
| 150 | + * @param subQueryScore score of that doc by the subQuery. |
| 151 | + * @param boostScore score of that doc by the boost query.
| 152 | + * @return custom score. |
| 153 | + */ |
| 154 | + public float customScore(int doc, float subQueryScore, float boostScore) { |
| 155 | + return (0.2f + boostScore * 10) * subQueryScore; // the 0.2 base keeps docs without a boost match from scoring zero
| 156 | + } |
| 157 | + |
| 158 | + /** |
| 159 | + * Explain the custom score. |
| 160 | + * Whenever overriding {@link #customScore(int, float, float)}, |
| 161 | + * this method should also be overridden to provide the correct explanation
| 162 | + * for the custom scoring part.
| 163 | + * @param doc doc being explained. |
| 164 | + * @param subQueryExpl explanation for the sub-query part. |
| 165 | + * @param boostExpl explanation for the value source part. |
| 166 | + * @return an explanation for the custom score |
| 167 | + */ |
| 168 | + public Explanation customExplain(int doc, Explanation subQueryExpl, Explanation boostExpl) { |
| 169 | + float boostScore = boostExpl==null ? 0 : boostExpl.getValue(); // a missing boost contributes 0, matching score()
| 170 | + float sc = (0.2f + boostScore * 10); |
| 171 | + Explanation exp = new Explanation( sc * subQueryExpl.getValue(), "custom score: product of:"); |
| 172 | + exp.addDetail(subQueryExpl); |
| 173 | + if (boostExpl != null) { |
| 174 | + exp.addDetail(boostExpl); |
| 175 | + } |
| 176 | + return exp; |
| 177 | + } |
| 178 | + //=========================== W E I G H T ============================ |
| 179 | + |
| 180 | + private class CustomWeight implements Weight { |
| 181 | + Searcher searcher; |
| 182 | + Weight subQueryWeight; |
| 183 | + Weight boostWeight; // optional |
| 184 | + boolean qStrict; |
| 185 | + |
| 186 | + public CustomWeight(Searcher searcher) throws IOException { |
| 187 | + this.searcher = searcher; |
| 188 | + this.subQueryWeight = subQuery.weight(searcher); |
| 189 | + if (boostQuery!=null) { |
| 190 | + this.boostWeight = boostQuery.createWeight(searcher); |
| 191 | + } |
| 192 | + this.qStrict = strict; |
| 193 | + } |
| 194 | + |
| 195 | + /*(non-Javadoc) @see org.apache.lucene.search.Weight#getQuery() */ |
| 196 | + public Query getQuery() { |
| 197 | + return CustomBoostQuery.this; |
| 198 | + } |
| 199 | + |
| 200 | + /*(non-Javadoc) @see org.apache.lucene.search.Weight#getValue() */ |
| 201 | + public float getValue() { |
| 202 | + return getBoost(); |
| 203 | + } |
| 204 | + |
| 205 | + /*(non-Javadoc) @see org.apache.lucene.search.Weight#sumOfSquaredWeights() */ |
| 206 | + public float sumOfSquaredWeights() throws IOException { |
| 207 | + float sum = subQueryWeight.sumOfSquaredWeights(); |
| 208 | + if (boostWeight!=null) { |
| 209 | + if (qStrict) { |
| 210 | + boostWeight.sumOfSquaredWeights(); // do not include ValueSource part in the query normalization |
| 211 | + } else { |
| 212 | + sum += boostWeight.sumOfSquaredWeights(); |
| 213 | + } |
| 214 | + } |
| 215 | + sum *= getBoost() * getBoost(); // boost each sub-weight |
| 216 | + return sum ; |
| 217 | + } |
| 218 | + |
| 219 | + /*(non-Javadoc) @see org.apache.lucene.search.Weight#normalize(float) */ |
| 220 | + public void normalize(float norm) { |
| 221 | + norm *= getBoost(); // incorporate boost |
| 222 | + subQueryWeight.normalize(norm); |
| 223 | + if (boostWeight!=null) { |
| 224 | + if (qStrict) { |
| 225 | + boostWeight.normalize(1); // do not normalize the ValueSource part |
| 226 | + } else { |
| 227 | + boostWeight.normalize(norm); |
| 228 | + } |
| 229 | + } |
| 230 | + } |
| 231 | + |
| 232 | + /*(non-Javadoc) @see org.apache.lucene.search.Weight#scorer(org.apache.lucene.index.IndexReader) */ |
| 233 | + public Scorer scorer(IndexReader reader) throws IOException { |
| 234 | + Scorer subQueryScorer = subQueryWeight.scorer(reader); |
| 235 | + Scorer boostScorer = (boostWeight==null ? null : boostWeight.scorer(reader)); |
| 236 | + return new CustomScorer(getSimilarity(searcher), reader, this, subQueryScorer, boostScorer); |
| 237 | + } |
| 238 | + |
| 239 | + /*(non-Javadoc) @see org.apache.lucene.search.Weight#explain(org.apache.lucene.index.IndexReader, int) */ |
| 240 | + public Explanation explain(IndexReader reader, int doc) throws IOException { |
| 241 | + return scorer(reader).explain(doc); |
| 242 | + } |
| 243 | + } |
| 244 | + |
| 245 | + |
| 246 | + //=========================== S C O R E R ============================ |
| 247 | + |
| 248 | + /** |
 | 249 | + * A scorer that applies a (callback) function to the scores of the subQuery. 
| 250 | + */ |
| 251 | + private class CustomScorer extends Scorer { |
| 252 | + private final CustomWeight weight; |
| 253 | + private final float qWeight; |
| 254 | + private Scorer subQueryScorer; |
| 255 | + private Scorer boostScorer; // optional |
| 256 | + private IndexReader reader; |
| 257 | + |
| 258 | + // constructor |
| 259 | + private CustomScorer(Similarity similarity, IndexReader reader, CustomWeight w, |
| 260 | + Scorer subQueryScorer, Scorer boostScorer) throws IOException { |
| 261 | + super(similarity); |
| 262 | + this.weight = w; |
| 263 | + this.qWeight = w.getValue(); |
| 264 | + this.subQueryScorer = subQueryScorer; |
| 265 | + this.boostScorer = boostScorer; |
| 266 | + this.reader = reader; |
| 267 | + } |
| 268 | + |
| 269 | + /*(non-Javadoc) @see org.apache.lucene.search.Scorer#next() */ |
| 270 | + public boolean next() throws IOException { |
| 271 | + boolean hasNext = subQueryScorer.next(); |
| 272 | + if (boostScorer!=null && hasNext) { |
| 273 | + boostScorer.skipTo(subQueryScorer.doc()); |
| 274 | + } |
| 275 | + return hasNext; |
| 276 | + } |
| 277 | + |
| 278 | + /*(non-Javadoc) @see org.apache.lucene.search.Scorer#doc() */ |
| 279 | + public int doc() { |
| 280 | + return subQueryScorer.doc(); |
| 281 | + } |
| 282 | + |
| 283 | + /*(non-Javadoc) @see org.apache.lucene.search.Scorer#score() */ |
| 284 | + public float score() throws IOException { |
| 285 | + float boostScore = (boostScorer==null || subQueryScorer.doc() != boostScorer.doc() ? 0 : boostScorer.score()); |
| 286 | + return qWeight * customScore(subQueryScorer.doc(), subQueryScorer.score(), boostScore); |
| 287 | + } |
| 288 | + |
| 289 | + /*(non-Javadoc) @see org.apache.lucene.search.Scorer#skipTo(int) */ |
| 290 | + public boolean skipTo(int target) throws IOException { |
| 291 | + boolean hasNext = subQueryScorer.skipTo(target); |
| 292 | + if (boostScorer!=null && hasNext) { |
| 293 | + boostScorer.skipTo(subQueryScorer.doc()); |
| 294 | + } |
| 295 | + return hasNext; |
| 296 | + } |
| 297 | + |
| 298 | + /*(non-Javadoc) @see org.apache.lucene.search.Scorer#explain(int) */ |
| 299 | + public Explanation explain(int doc) throws IOException { |
| 300 | + Explanation subQueryExpl = weight.subQueryWeight.explain(reader,doc); |
| 301 | + if (!subQueryExpl.isMatch()) { |
| 302 | + return subQueryExpl; |
| 303 | + } |
| 304 | + // match |
| 305 | + Explanation boostExpl = boostScorer==null ? null : |
| 306 | + weight.qStrict ? boostScorer.explain(doc) : weight.boostWeight.explain(reader,doc); |
| 307 | + Explanation customExp = customExplain(doc,subQueryExpl,boostExpl); |
| 308 | + float sc = qWeight * customExp.getValue(); |
| 309 | + Explanation res = new ComplexExplanation( |
| 310 | + true, sc, CustomBoostQuery.this.toString() + ", product of:"); |
| 311 | + res.addDetail(customExp); |
| 312 | + res.addDetail(new Explanation(qWeight, "queryBoost")); // actually using the q boost as q weight (== weight value) |
| 313 | + return res; |
| 314 | + } |
| 315 | + } |
| 316 | + |
| 317 | + /*(non-Javadoc) @see org.apache.lucene.search.Query#createWeight(org.apache.lucene.search.Searcher) */ |
| 318 | + protected Weight createWeight(Searcher searcher) throws IOException { |
| 319 | + return new CustomWeight(searcher); |
| 320 | + } |
| 321 | + |
| 322 | + /** |
| 323 | + * Checks if this is strict custom scoring. |
 | 324 | + * In strict custom scoring, the ValueSource part does not participate in weight normalization. 
| 325 | + * This may be useful when one wants full control over how scores are modified, and does |
| 326 | + * not care about normalizing by the ValueSource part. |
 | 327 | + * One particular case where this is useful is for testing this query. 
| 328 | + * <P> |
| 329 | + * Note: only has effect when the ValueSource part is not null. |
| 330 | + */ |
| 331 | + public boolean isStrict() { |
| 332 | + return strict; |
| 333 | + } |
| 334 | + |
| 335 | + /** |
| 336 | + * Set the strict mode of this query. |
| 337 | + * @param strict The strict mode to set. |
| 338 | + * @see #isStrict() |
| 339 | + */ |
| 340 | + public void setStrict(boolean strict) { |
| 341 | + this.strict = strict; |
| 342 | + } |
| 343 | + |
| 344 | + /** |
| 345 | + * A short name of this query, used in {@link #toString(String)}. |
| 346 | + */ |
| 347 | + public String name() { |
| 348 | + return "custom"; |
| 349 | + } |
| 350 | + |
| 351 | +} |
| 352 | + |
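
A minimal sketch of how this query is meant to be used, pairing a text sub-query with the rank value source added later in this patch. The two-argument constructor is assumed from the earlier, unshown part of CustomBoostQuery.java, and the index path is hypothetical:

    // sketch only, not part of the patch
    Query userQuery = new TermQuery(new Term("contents", "adams"));
    CustomBoostQuery q = new CustomBoostQuery(userQuery,
        new RankValueSourceQuery(new RankValueSource()));
    q.setStrict(true); // keep the rank part out of weight normalization
    Hits hits = new IndexSearcher("/search/indexes/wikilucene").search(q);
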
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/ResultSet.java |
— | — | @@ -1,6 +1,8 @@ |
2 | 2 | package org.wikimedia.lsearch.beans; |
3 | 3 | |
4 | 4 | import java.io.Serializable; |
| 5 | +import java.util.ArrayList; |
| 6 | +import java.util.Collection; |
5 | 7 | |
6 | 8 | import org.apache.lucene.search.Explanation; |
7 | 9 | |
— | — | @@ -9,6 +11,7 @@ |
10 | 12 | public double score; |
11 | 13 | public String namespace; |
12 | 14 | public String title; |
| 15 | + public ArrayList<String> context; |
13 | 16 | Explanation explanation; |
14 | 17 | |
15 | 18 | public ResultSet(String key) { |
— | — | @@ -43,7 +46,25 @@ |
44 | 47 | @Override |
45 | 48 | public String toString() { |
46 | 49 | return score+" "+namespace+":"+title+(explanation==null? "" : "\n"+explanation); |
47 | | - } |
| 50 | + } |
48 | 51 | |
| 52 | + public void addContext(Collection<String> texts){ |
| 53 | + if(texts == null) |
| 54 | + return; |
| 55 | + for(String t : texts) |
| 56 | + addContext(t); |
| 57 | + } |
49 | 58 | |
| 59 | + public void addContext(String text){ |
| 60 | + if(context == null) |
| 61 | + context = new ArrayList<String>(); |
| 62 | + |
| 63 | + context.add(text.replace('\n',' ')); |
| 64 | + } |
| 65 | + |
| 66 | + public ArrayList<String> getContext(){ |
| 67 | + return context; |
| 68 | + } |
| 69 | + |
| 70 | + |
50 | 71 | } |
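
The new context field carries the text snippets that explain why a related article was suggested. A short usage sketch (the three-argument constructor is the one relatedSearch() uses later in this patch; the snippet text is made up):

    ResultSet rs = new ResultSet(0.8, "0", "Douglas Adams");
    rs.addContext("author of The Hitchhiker's Guide\nto the Galaxy");
    String ctx = rs.getContext().get(0); // newline flattened to a space by addContext()
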
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/Title.java |
— | — | @@ -17,11 +17,11 @@ |
18 | 18 | } |
19 | 19 | |
20 | 20 | public Title(String key){ |
21 | | - String[] parts = key.split(":",2); |
22 | | - if(parts.length != 2) |
| 21 | + int col = key.indexOf(':'); |
| 22 | + if(col == -1) |
23 | 23 | throw new RuntimeException("Wrong key format in Title constructor"); |
24 | | - this.namespace = Integer.parseInt(parts[0]); |
25 | | - this.title = parts[1]; |
| 24 | + this.namespace = Integer.parseInt(key.substring(0,col)); |
| 25 | + this.title = key.substring(col+1); |
26 | 26 | } |
27 | 27 | |
28 | 28 | public String getKey(){ |
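
The constructor now splits on the first colon only, so page titles that themselves contain colons are parsed correctly:

    Title t = new Title("4:Wikipedia:Manual of Style");
    // namespace = 4, title = "Wikipedia:Manual of Style"
    // new Title("no-colon-key") still throws RuntimeException
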
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/prefix/PrefixIndexBuilder.java |
— | — | @@ -119,7 +119,7 @@ |
120 | 120 | selected.add(sorted.get(i).getKey()); |
121 | 121 | } |
122 | 122 | Document d = new Document(); |
123 | | - d.add(new Field("prefix",prefix,Field.Store.NO,Field.Index.UN_TOKENIZED)); |
| 123 | + d.add(new Field("prefix",prefix,Field.Store.NO,Field.Index.NO_NORMS)); |
124 | 124 | d.add(new Field("articles",new StringList(selected).toString(),Field.Store.YES,Field.Index.NO)); |
125 | 125 | writer.addDocument(d); |
126 | 126 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/DumpImporter.java |
— | — | @@ -38,17 +38,17 @@ |
39 | 39 | Revision revision; |
40 | 40 | SimpleIndexWriter writer; |
41 | 41 | int count = 0, limit; |
42 | | - LinkAnalysisStorage las; |
| 42 | + Links links; |
43 | 43 | String langCode; |
44 | 44 | RelatedStorage related; |
45 | 45 | |
46 | 46 | public DumpImporter(String dbname, int limit, Boolean optimize, Integer mergeFactor, |
47 | | - Integer maxBufDocs, boolean newIndex, LinkAnalysisStorage las, String langCode){ |
| 47 | + Integer maxBufDocs, boolean newIndex, Links links, String langCode){ |
48 | 48 | Configuration.open(); // make sure configuration is loaded |
49 | 49 | IndexId iid = IndexId.get(dbname); |
50 | | - writer = new SimpleIndexWriter(iid, optimize, mergeFactor, maxBufDocs, newIndex); |
| 50 | + writer = new SimpleIndexWriter(links, iid, optimize, mergeFactor, maxBufDocs, newIndex); |
51 | 51 | this.limit = limit; |
52 | | - this.las = las; |
| 52 | + this.links = links; |
53 | 53 | this.langCode = langCode; |
54 | 54 | this.related = new RelatedStorage(iid); |
55 | 55 | if(!related.canRead()) |
— | — | @@ -62,26 +62,29 @@ |
63 | 63 | } |
64 | 64 | public void writeEndPage() throws IOException { |
65 | 65 | String key = page.Title.Namespace+":"+page.Title.Text; |
66 | | - ArticleAnalytics aa = las.getAnaliticsForArticle(key); |
67 | | - int references = aa.getReferences(); |
68 | | - boolean isRedirect = aa.isRedirect(); |
69 | | - int redirectTargetNamespace = aa.getRedirectTargetNamespace(); |
| 66 | + int references = links.getNumInLinks(key); |
| 67 | + boolean isRedirect = links.isRedirect(key); |
| 68 | + int redirectTargetNamespace = isRedirect? links.getRedirectTargetNamespace(key) : -1; |
70 | 69 | |
71 | 70 | // make list of redirects |
72 | 71 | ArrayList<Redirect> redirects = new ArrayList<Redirect>(); |
73 | 72 | ArrayList<String> anchors = new ArrayList<String>(); |
74 | | - anchors.addAll(aa.getAnchorText()); |
75 | | - for(String rk : aa.getRedirectKeys()){ |
| 73 | + //anchors.addAll(aa.getAnchorText()); |
| 74 | + for(String rk : links.getRedirectsTo(key)){ |
76 | 75 | String[] parts = rk.toString().split(":",2); |
77 | | - ArticleAnalytics raa = las.getAnaliticsForReferences(rk); |
78 | | - redirects.add(new Redirect(Integer.parseInt(parts[0]),parts[1],raa.getReferences())); |
79 | | - anchors.addAll(raa.getAnchorText()); |
| 76 | + int redirectRef = links.getNumInLinks(rk); |
| 77 | + redirects.add(new Redirect(Integer.parseInt(parts[0]),parts[1],redirectRef)); |
| 78 | + //anchors.addAll(raa.getAnchorText()); |
80 | 79 | } |
81 | 80 | ArrayList<RelatedTitle> rel = null; |
82 | 81 | if(related != null) |
83 | 82 | rel = related.getRelated(key); |
84 | 83 | else |
85 | 84 | rel = new ArrayList<RelatedTitle>(); |
| 85 | + // extract contexts |
| 86 | + /*for(RelatedTitle t : rel){ |
| 87 | + links.getContext(t.getRelated().getKey(),key); |
| 88 | + } */ |
86 | 89 | // make article |
87 | 90 | Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect, |
88 | 91 | references,redirectTargetNamespace,redirects,rel,anchors); |
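
The Links index now answers the per-article questions that LinkAnalysisStorage used to. A sketch of the lookups this class relies on (all of these calls appear in the hunk above; the key is an example):

    Links links = Links.openForRead(iid.getLinks(), iid.getLinks().getImportPath());
    String key = "0:Douglas Adams";
    int references = links.getNumInLinks(key); // incoming link count
    int targetNs = links.isRedirect(key) ? links.getRedirectTargetNamespace(key) : -1;
    for (String rk : links.getRedirectsTo(key))
        System.out.println(rk + " redirects here");
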
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java |
— | — | @@ -17,6 +17,7 @@ |
18 | 18 | import org.wikimedia.lsearch.index.IndexUpdateRecord; |
19 | 19 | import org.wikimedia.lsearch.index.WikiIndexModifier; |
20 | 20 | import org.wikimedia.lsearch.index.WikiSimilarity; |
| 21 | +import org.wikimedia.lsearch.ranks.Links; |
21 | 22 | |
22 | 23 | /** |
23 | 24 | * IndexWriter for building indexes from scratch. |
— | — | @@ -33,8 +34,10 @@ |
34 | 35 | protected Integer mergeFactor, maxBufDocs; |
35 | 36 | protected boolean newIndex; |
36 | 37 | protected String langCode; |
| 38 | + protected Links links; |
37 | 39 | |
38 | | - public SimpleIndexWriter(IndexId iid, Boolean optimize, Integer mergeFactor, Integer maxBufDocs, boolean newIndex){ |
| 40 | + public SimpleIndexWriter(Links links, IndexId iid, Boolean optimize, Integer mergeFactor, Integer maxBufDocs, boolean newIndex){ |
| 41 | + this.links = links; |
39 | 42 | this.iid = iid; |
40 | 43 | this.optimize = optimize; |
41 | 44 | this.mergeFactor = mergeFactor; |
— | — | @@ -43,7 +46,7 @@ |
44 | 47 | GlobalConfiguration global = GlobalConfiguration.getInstance(); |
45 | 48 | langCode = global.getLanguage(iid.getDBname()); |
46 | 49 | FieldBuilder.Case dCase = (global.exactCaseIndex(iid.getDBname()))? FieldBuilder.Case.EXACT_CASE : FieldBuilder.Case.IGNORE_CASE; |
47 | | - builder = new FieldBuilder(langCode,dCase); |
| 50 | + builder = new FieldBuilder(iid,dCase); |
48 | 51 | indexes = new HashMap<String,IndexWriter>(); |
49 | 52 | // open all relevant indexes |
50 | 53 | if(iid.isSingle()) |
— | — | @@ -109,7 +112,7 @@ |
110 | 113 | IndexWriter writer = indexes.get(target.toString()); |
111 | 114 | if(writer == null) |
112 | 115 | return; |
113 | | - Object[] ret = WikiIndexModifier.makeDocumentAndAnalyzer(a,builder,iid); |
| 116 | + Object[] ret = WikiIndexModifier.makeDocumentAndAnalyzer(a,builder,iid,links); |
114 | 117 | Document doc = (Document) ret[0]; |
115 | 118 | Analyzer analyzer = (Analyzer) ret[1]; |
116 | 119 | try { |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/Importer.java |
— | — | @@ -20,6 +20,7 @@ |
21 | 21 | import org.wikimedia.lsearch.ranks.Links; |
22 | 22 | import org.wikimedia.lsearch.ranks.RankBuilder; |
23 | 23 | import org.wikimedia.lsearch.related.CompactLinks; |
| 24 | +import org.wikimedia.lsearch.related.RelatedBuilder; |
24 | 25 | import org.wikimedia.lsearch.storage.LinkAnalysisStorage; |
25 | 26 | import org.wikimedia.lsearch.storage.Storage; |
26 | 27 | import org.wikimedia.lsearch.util.Localization; |
— | — | @@ -45,6 +46,7 @@ |
46 | 47 | Integer mergeFactor = null, maxBufDocs = null; |
47 | 48 | boolean newIndex = true, makeSnapshot = false; |
48 | 49 | boolean snapshotDb = false, useOldLinkAnalysis = false; |
| 50 | + boolean useOldRelated = false; |
49 | 51 | |
50 | 52 | System.out.println("MediaWiki Lucene search indexer - index builder from xml database dumps.\n"); |
51 | 53 | |
— | — | @@ -52,12 +54,13 @@ |
53 | 55 | log = Logger.getLogger(Importer.class); |
54 | 56 | |
55 | 57 | if(args.length < 2){ |
56 | | - System.out.println("Syntax: java Importer [-a] [-n] [-s] [-la] [-l limit] [-o optimize] [-m mergeFactor] [-b maxBufDocs] <inputfile> <dbname>"); |
| 58 | + System.out.println("Syntax: java Importer [-a] [-n] [-s] [-l] [-r] [-lm limit] [-o optimize] [-m mergeFactor] [-b maxBufDocs] <inputfile> <dbname>"); |
57 | 59 | System.out.println("Options: "); |
58 | 60 | System.out.println(" -a - don't create new index, append to old"); |
59 | 61 | System.out.println(" -s - make index snapshot when finished"); |
60 | | - System.out.println(" -la - use earlier link analysis index, don't recalculate"); |
61 | | - System.out.println(" -l limit_num - add at most limit_num articles"); |
| 62 | + System.out.println(" -l - use earlier link analysis index, don't recalculate"); |
| 63 | + System.out.println(" -r - use earlier related index, don't recalculate"); |
| 64 | + System.out.println(" -lm limit_num - add at most limit_num articles"); |
62 | 65 | System.out.println(" -o optimize - true/false overrides optimization param from global settings"); |
63 | 66 | System.out.println(" -m mergeFactor - overrides param from global settings"); |
64 | 67 | System.out.println(" -b maxBufDocs - overrides param from global settings"); |
— | — | @@ -65,7 +68,7 @@ |
66 | 69 | return; |
67 | 70 | } |
68 | 71 | for(int i=0;i<args.length;i++){ |
69 | | - if(args[i].equals("-l")) |
| 72 | + if(args[i].equals("-lm")) |
70 | 73 | limit = Integer.parseInt(args[++i]); |
71 | 74 | else if(args[i].equals("-o")) |
72 | 75 | optimize = Boolean.parseBoolean(args[++i]); |
— | — | @@ -75,8 +78,10 @@ |
76 | 79 | maxBufDocs = Integer.parseInt(args[++i]); |
77 | 80 | else if(args[i].equals("-a")) |
78 | 81 | newIndex = false; |
79 | | - else if(args[i].equals("-la")) |
| 82 | + else if(args[i].equals("-l")) |
80 | 83 | useOldLinkAnalysis = true; |
| 84 | + else if(args[i].equals("-r")) |
| 85 | + useOldRelated = true; |
81 | 86 | else if(args[i].equals("-s")) |
82 | 87 | makeSnapshot = true; |
83 | 88 | else if(args[i].equals("--snapshot")){ |
— | — | @@ -106,17 +111,23 @@ |
107 | 112 | long start = System.currentTimeMillis(); |
108 | 113 | |
109 | 114 | if(!useOldLinkAnalysis){ |
110 | | - // regenerate link and redirect information |
111 | | - Links links = RankBuilder.processLinks(inputfile,RankBuilder.getTitles(inputfile,langCode,iid),langCode); |
| 115 | + // regenerate link and redirect information |
112 | 116 | try { |
113 | | - RankBuilder.storeLinkAnalysis(links,iid); |
| 117 | + RankBuilder.processLinks(inputfile,Links.createNew(iid),iid,langCode); |
114 | 118 | } catch (IOException e) { |
115 | 119 | log.fatal("Cannot store link analytics: "+e.getMessage()); |
116 | 120 | return; |
117 | 121 | } |
118 | 122 | } |
119 | | - log.info("Third pass, indexing articles..."); |
120 | | - |
| 123 | + if(!useOldRelated){ |
| 124 | + try { |
| 125 | + RelatedBuilder.rebuildFromLinks(iid); |
| 126 | + } catch (IOException e) { |
| 127 | + log.fatal("Cannot make related mapping: "+e.getMessage()); |
| 128 | + return; |
| 129 | + } |
| 130 | + } |
| 131 | + |
121 | 132 | // open |
122 | 133 | InputStream input = null; |
123 | 134 | try { |
— | — | @@ -124,31 +135,29 @@ |
125 | 136 | } catch (IOException e) { |
126 | 137 | log.fatal("I/O error opening "+inputfile); |
127 | 138 | return; |
128 | | - } |
129 | | - LinkAnalysisStorage las = new LinkAnalysisStorage(iid); |
130 | | - // read |
131 | | - DumpImporter dp = new DumpImporter(dbname,limit,optimize,mergeFactor,maxBufDocs,newIndex,las,langCode); |
132 | | - XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(dp, 1000)); |
| 139 | + } |
| 140 | + long end = start; |
133 | 141 | try { |
| 142 | + log.info("Indexing articles..."); |
| 143 | + IndexId ll = iid.getLinks(); |
| 144 | + Links links = Links.openForRead(ll,ll.getImportPath()); |
| 145 | + // read |
| 146 | + DumpImporter dp = new DumpImporter(dbname,limit,optimize,mergeFactor,maxBufDocs,newIndex,links,langCode); |
| 147 | + XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(dp, 1000)); |
134 | 148 | reader.readDump(); |
| 149 | + log.info("Closing/optimizing index..."); |
| 150 | + dp.closeIndex(); |
| 151 | + end = System.currentTimeMillis(); |
| 152 | + System.out.println("Cache stats: "+links.getCache().getStats()); |
135 | 153 | } catch (IOException e) { |
136 | 154 | if(!e.getMessage().equals("stopped")){ |
137 | | - log.fatal("I/O error reading dump for "+dbname+" from "+inputfile); |
| 155 | + log.fatal("I/O error processing dump for "+dbname+" from "+inputfile+" : "+e.getMessage()); |
| 156 | + e.printStackTrace(); |
138 | 157 | return; |
139 | 158 | } |
140 | | - } |
141 | | - |
142 | | - long end = System.currentTimeMillis(); |
143 | | - |
144 | | - log.info("Closing/optimizing index..."); |
145 | | - try{ |
146 | | - dp.closeIndex(); |
147 | | - } catch(IOException e){ |
148 | | - e.printStackTrace(); |
149 | | - log.fatal("Cannot close/optimize index : "+e.getMessage()); |
150 | 159 | System.exit(1); |
151 | 160 | } |
152 | | - |
| 161 | + |
153 | 162 | long finalEnd = System.currentTimeMillis(); |
154 | 163 | |
155 | 164 | System.out.println("Finished indexing in "+formatTime(end-start)+", with final index optimization in "+formatTime(finalEnd-end)); |
— | — | @@ -168,6 +177,16 @@ |
169 | 178 | } else |
170 | 179 | IndexThread.makeIndexSnapshot(iid,iid.getImportPath()); |
171 | 180 | } |
| 181 | + |
| 182 | + // some cache stats |
| 183 | + /*Cache cache = CacheManager.create().getCache("links"); |
| 184 | + Statistics s = cache.getStatistics(); |
| 185 | + |
| 186 | + long hit = s.getCacheHits(); |
| 187 | + long miss = s.getCacheMisses(); |
| 188 | + |
| 189 | + System.out.println("Cache stats: hits = "+hit+", miss = "+miss); */ |
| 190 | + |
172 | 191 | } |
173 | 192 | |
174 | 193 | private static String formatTime(long l) { |
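
With the renamed options, a typical rebuild that caps the article count and snapshots the finished index would be invoked like this (the dump file and dbname are examples):

    java org.wikimedia.lsearch.importer.Importer -s -lm 50000 enwiki-pages-articles.xml enwiki
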
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/RelatedStorage.java |
— | — | @@ -35,7 +35,7 @@ |
36 | 36 | StringList sl = new StringList(CompactRelated.convertToStringList(rel)); |
37 | 37 | Document doc = new Document(); |
38 | 38 | doc.add(new Field("key",key,Field.Store.YES,Field.Index.UN_TOKENIZED)); |
39 | | - doc.add(new Field("related",sl.toString(),Field.Store.YES,Field.Index.NO)); |
| 39 | + doc.add(new Field("related",sl.toString(),Field.Store.COMPRESS,Field.Index.NO)); |
40 | 40 | writer.addDocument(doc); |
41 | 41 | } |
42 | 42 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/LinkAnalysisStorage.java |
— | — | @@ -32,17 +32,20 @@ |
33 | 33 | * @author rainman |
34 | 34 | * |
35 | 35 | */ |
| 36 | +@Deprecated |
36 | 37 | public class LinkAnalysisStorage extends LuceneStorage { |
37 | 38 | static Logger log = Logger.getLogger(LinkAnalysisStorage.class); |
38 | 39 | protected SetBasedFieldSelector selRef; |
39 | 40 | |
40 | 41 | public LinkAnalysisStorage(IndexId iid){ |
41 | | - super(iid.getLinkAnalysis()); |
| 42 | + //super(iid.getLinkAnalysis()); |
| 43 | + super(iid); |
42 | 44 | init(); |
43 | 45 | } |
44 | 46 | |
45 | 47 | public LinkAnalysisStorage(IndexId iid, String path){ |
46 | | - super(iid.getLinkAnalysis(),path); |
| 48 | + //super(iid.getLinkAnalysis(),path); |
| 49 | + super(iid,path); |
47 | 50 | init(); |
48 | 51 | } |
49 | 52 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/IndexId.java |
— | — | @@ -58,7 +58,7 @@ |
59 | 59 | /** If true, this machine is an indexer for this index */ |
60 | 60 | protected boolean myIndex; |
61 | 61 | |
62 | | - protected enum IndexType { SINGLE, MAINSPLIT, SPLIT, NSSPLIT, SPELL, LINK_ANALYSIS, RELATED, PREFIX }; |
| 62 | + protected enum IndexType { SINGLE, MAINSPLIT, SPLIT, NSSPLIT, SPELL, LINKS, RELATED, PREFIX, PREFIX_TITLES }; |
63 | 63 | |
64 | 64 | /** Type of index, enumeration */ |
65 | 65 | protected IndexType type; |
— | — | @@ -95,6 +95,9 @@ |
96 | 96 | protected String OAIRepository; |
97 | 97 | |
98 | 98 | protected String rsyncSnapshotPath = null; |
| 99 | + |
| 100 | + /** language code, e.g. "en" */ |
| 101 | + protected String langCode = null; |
99 | 102 | |
100 | 103 | /** |
101 | 104 | * Get index Id object given it's string representation, the actual object |
— | — | @@ -105,7 +108,10 @@ |
106 | 109 | * @return |
107 | 110 | */ |
108 | 111 | static public IndexId get(String dbrole){ |
109 | | - return GlobalConfiguration.getIndexId(dbrole); |
| 112 | + IndexId ret = GlobalConfiguration.getIndexId(dbrole); |
| 113 | + if(ret == null) |
| 114 | + throw new RuntimeException("Index "+dbrole+" doesn't exist"); |
| 115 | + return ret; |
110 | 116 | } |
111 | 117 | |
112 | 118 | /** |
— | — | @@ -158,12 +164,14 @@ |
159 | 165 | this.type = IndexType.NSSPLIT; |
160 | 166 | else if(type.equals("spell")) |
161 | 167 | this.type = IndexType.SPELL; |
162 | | - else if(type.equals("link_analysis")) |
163 | | - this.type = IndexType.LINK_ANALYSIS; |
| 168 | + else if(type.equals("links")) |
| 169 | + this.type = IndexType.LINKS; |
164 | 170 | else if(type.equals("related")) |
165 | 171 | this.type = IndexType.RELATED; |
166 | 172 | else if(type.equals("prefix")) |
167 | 173 | this.type = IndexType.PREFIX; |
| 174 | + else if(type.equals("prefix_titles")) |
| 175 | + this.type = IndexType.PREFIX_TITLES; |
168 | 176 | |
169 | 177 | // parts |
170 | 178 | String[] parts = dbrole.split("\\."); |
— | — | @@ -259,9 +267,9 @@ |
260 | 268 | public boolean isSpell(){ |
261 | 269 | return type == IndexType.SPELL; |
262 | 270 | } |
263 | | - /** If this is the link-analysis index */ |
264 | | - public boolean isLinkAnalysis(){ |
265 | | - return type == IndexType.LINK_ANALYSIS; |
| 271 | + /** If this is the index storing pagelinks */ |
| 272 | + public boolean isLinks(){ |
| 273 | + return type == IndexType.LINKS; |
266 | 274 | } |
267 | 275 | /** If this is the index storing info about related articles */ |
268 | 276 | public boolean isRelated(){ |
— | — | @@ -271,6 +279,10 @@ |
272 | 280 | public boolean isPrefix(){ |
273 | 281 | return type == IndexType.PREFIX; |
274 | 282 | } |
| 283 | + /** If this is the index storing titles for the prefix index */ |
| 284 | + public boolean isPrefixTitles(){ |
| 285 | + return type == IndexType.PREFIX_TITLES; |
| 286 | + } |
275 | 287 | |
276 | 288 | /** If this is a split index, returns the current part number, e.g. for entest.part4 will return 4 */ |
277 | 289 | public int getPartNum() { |
— | — | @@ -418,7 +430,7 @@ |
419 | 431 | |
420 | 432 | /** get all hosts that search db this iid belongs to */ |
421 | 433 | public HashSet<String> getDBSearchHosts(){ |
422 | | - if(isSingle() || isSpell() || isLinkAnalysis() || isRelated() || isPrefix()) |
| 434 | + if(isSingle() || isSpell() || isLinks() || isRelated() || isPrefix() || isPrefixTitles()) |
423 | 435 | return searchHosts; |
424 | 436 | else{ |
425 | 437 | // add all hosts that search: dbname and all parts |
— | — | @@ -469,7 +481,7 @@ |
470 | 482 | */ |
471 | 483 | public HashSet<String> getPhysicalIndexes() { |
472 | 484 | HashSet<String> ret = new HashSet<String>(); |
473 | | - if(isSingle() || isSpell() || isLinkAnalysis() || isRelated() || isPrefix()) |
| 485 | + if(isSingle() || isSpell() || isLinks() || isRelated() || isPrefix() || isPrefixTitles()) |
474 | 486 | ret.add(dbrole); |
475 | 487 | else if(isMainsplit() || isSplit() || isNssplit()){ |
476 | 488 | for(String p : splitParts) |
— | — | @@ -545,9 +557,9 @@ |
546 | 558 | return get(dbname+".spell"); |
547 | 559 | } |
548 | 560 | |
549 | | - /** Get the link analysis iid */ |
550 | | - public IndexId getLinkAnalysis() { |
551 | | - return get(dbname+".link_analysis"); |
| 561 | + /** Get the pagelinks iid */ |
| 562 | + public IndexId getLinks() { |
| 563 | + return get(dbname+".links"); |
552 | 564 | } |
553 | 565 | |
554 | 566 | /** Get the related-articles index iid */ |
— | — | @@ -560,6 +572,17 @@ |
561 | 573 | return get(dbname+".prefix"); |
562 | 574 | } |
563 | 575 | |
| 576 | + /** Get the prefix titles index iid */ |
| 577 | + public IndexId getPrefixTitles() { |
| 578 | + return get(dbname+".prefix_titles"); |
| 579 | + } |
564 | 580 | |
| 581 | + /** Get language code for this db, e.g. "en" */ |
| 582 | + public String getLangCode(){ |
| 583 | + if(langCode == null) |
| 584 | + langCode = GlobalConfiguration.getInstance().getLanguage(dbname); |
| 585 | + return langCode; |
| 586 | + } |
| 587 | + |
565 | 588 | |
566 | 589 | } |
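
A sketch of resolving the derived index ids introduced here; note that IndexId.get() now fails fast with a RuntimeException for an unknown dbrole instead of returning null:

    IndexId iid = IndexId.get("wikilucene");
    IndexId links = iid.getLinks();           // wikilucene.links
    IndexId titles = iid.getPrefixTitles();   // wikilucene.prefix_titles
    String lang = iid.getLangCode();          // e.g. "en", cached after the first call
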
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/GlobalConfiguration.java |
— | — | @@ -186,10 +186,12 @@ |
187 | 187 | } |
188 | 188 | } |
189 | 189 | // add the link analysis to indexers |
190 | | - if(!types.contains("link_analysis")) |
191 | | - database.get(dbname).put("link_analysis",new Hashtable<String,String>()); |
| 190 | + if(!types.contains("links")) |
| 191 | + database.get(dbname).put("links",new Hashtable<String,String>()); |
192 | 192 | if(!types.contains("related")) |
193 | 193 | database.get(dbname).put("related",new Hashtable<String,String>()); |
| 194 | + if(!types.contains("prefix_titles")) |
| 195 | + database.get(dbname).put("prefix_titles",new Hashtable<String,String>()); |
194 | 196 | } |
195 | 197 | // expand logical index names on searchers |
196 | 198 | for(String host : search.keySet()){ |
— | — | @@ -232,7 +234,7 @@ |
233 | 235 | } else if(typeid.matches("nspart[1-9][0-9]*")){ |
234 | 236 | type = "nssplit"; |
235 | 237 | dbrole = dbname + "." + typeid; |
236 | | - } else if(typeid.equals("spell") || typeid.equals("link_analysis") || typeid.equals("related") || typeid.equals("prefix")){ |
| 238 | + } else if(typeid.equals("spell") || typeid.equals("links") || typeid.equals("related") || typeid.equals("prefix") || typeid.equals("prefix_titles")){ |
237 | 239 | type = typeid; |
238 | 240 | dbrole = dbname + "." + typeid; |
239 | 241 | } else |
— | — | @@ -252,7 +254,7 @@ |
253 | 255 | } |
254 | 256 | boolean searched = (getSearchHosts(dbrole).size() != 0); |
255 | 257 | if(!searched && !(typeid.equals("mainsplit") || typeid.equals("split") |
256 | | - || typeid.equals("nssplit") || typeid.equals("link_analysis") || typeid.equals("related"))){ |
| 258 | + || typeid.equals("nssplit") || typeid.equals("links") || typeid.equals("related") || typeid.equals("prefix_titles"))){ |
257 | 259 | if(verbose) |
258 | 260 | System.out.println("WARNING: in Global Configuration: index "+dbrole+" is not searched by any host."); |
259 | 261 | } |
— | — | @@ -519,7 +521,7 @@ |
520 | 522 | } else if(typeid.matches("nspart[1-9][0-9]*")){ |
521 | 523 | type = "nssplit"; |
522 | 524 | dbrole = dbname + "." + typeid; |
523 | | - } else if(typeid.equals("spell") || typeid.equals("link_analysis") || typeid.equals("related") || typeid.equals("prefix")){ |
| 525 | + } else if(typeid.equals("spell") || typeid.equals("links") || typeid.equals("related") || typeid.equals("prefix") || typeid.equals("prefix_titles")){ |
524 | 526 | type = typeid; |
525 | 527 | dbrole = dbname + "." + typeid; |
526 | 528 | } else |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java |
— | — | @@ -5,6 +5,7 @@ |
6 | 6 | import java.net.URI; |
7 | 7 | import java.text.MessageFormat; |
8 | 8 | import java.util.ArrayList; |
| 9 | +import java.util.Collection; |
9 | 10 | import java.util.HashMap; |
10 | 11 | import java.util.HashSet; |
11 | 12 | import java.util.Hashtable; |
— | — | @@ -31,14 +32,19 @@ |
32 | 33 | import org.wikimedia.lsearch.analyzers.WikiQueryParser; |
33 | 34 | import org.wikimedia.lsearch.beans.ResultSet; |
34 | 35 | import org.wikimedia.lsearch.beans.SearchResults; |
| 36 | +import org.wikimedia.lsearch.beans.Title; |
35 | 37 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
36 | 38 | import org.wikimedia.lsearch.config.IndexId; |
37 | 39 | import org.wikimedia.lsearch.frontend.SearchDaemon; |
38 | 40 | import org.wikimedia.lsearch.frontend.SearchServer; |
39 | 41 | import org.wikimedia.lsearch.interoperability.RMIMessengerClient; |
| 42 | +import org.wikimedia.lsearch.ranks.Links; |
40 | 43 | import org.wikimedia.lsearch.ranks.StringList; |
| 44 | +import org.wikimedia.lsearch.related.Related; |
| 45 | +import org.wikimedia.lsearch.related.RelatedTitle; |
41 | 46 | import org.wikimedia.lsearch.spell.Suggest; |
42 | 47 | import org.wikimedia.lsearch.spell.SuggestQuery; |
| 48 | +import org.wikimedia.lsearch.util.Localization; |
43 | 49 | import org.wikimedia.lsearch.util.QueryStringMap; |
44 | 50 | |
45 | 51 | /** |
— | — | @@ -54,6 +60,7 @@ |
55 | 61 | protected final int maxlines = 1000; |
56 | 62 | protected final int maxoffset = 10000; |
57 | 63 | protected static GlobalConfiguration global = null; |
| 64 | + protected static Hashtable<String,Hashtable<String,Integer>> dbNamespaces = new Hashtable<String,Hashtable<String,Integer>>(); |
58 | 65 | |
59 | 66 | public SearchEngine(){ |
60 | 67 | if(global == null) |
— | — | @@ -102,17 +109,87 @@ |
103 | 110 | // TODO: return searchTitles(searchterm); |
104 | 111 | } else if (what.equals("prefix")){ |
105 | 112 | return prefixSearch(iid, searchterm); |
| 113 | + } else if (what.equals("related")){ |
| 114 | + int offset = 0, limit = 100; boolean exactCase = false; |
| 115 | + if (query.containsKey("offset")) |
| 116 | + offset = Math.max(Integer.parseInt((String)query.get("offset")), 0); |
| 117 | + if (query.containsKey("limit")) |
| 118 | + limit = Math.min(Integer.parseInt((String)query.get("limit")), maxlines); |
| 119 | + return relatedSearch(iid, searchterm, offset, limit); |
106 | 120 | } else { |
107 | 121 | SearchResults res = new SearchResults(); |
108 | 122 | res.setErrorMsg("Unrecognized search type. Try one of: " + |
109 | | - "search, explain, raw, rawexplain, prefix."); |
| 123 | + "search, explain, raw, rawexplain, prefix, related."); |
110 | 124 | log.warn("Unknown request type [" + what + "]."); |
111 | 125 | return res; |
112 | 126 | } |
113 | 127 | return null; |
114 | 128 | } |
115 | 129 | |
116 | | - private SearchResults prefixSearch(IndexId iid, String searchterm) { |
| 130 | + /** Convert User:Rainman into 2:Rainman */ |
| 131 | + protected String getKey(String title, IndexId iid){ |
| 132 | + int colon = title.indexOf(':'); |
| 133 | + if(colon != -1 && colon != title.length()-1){ |
| 134 | + String ns = title.substring(0,colon); |
| 135 | + Integer inx = dbNamespaces.get(iid.getDBname()).get(ns.toLowerCase()); |
| 136 | + if(inx != null){ |
| 137 | + return inx +":"+ title.substring(colon+1); |
| 138 | + } |
| 139 | + } |
| 140 | + |
| 141 | + return "0:" + title; |
| 142 | + } |
| 143 | + |
| 144 | + protected SearchResults relatedSearch(IndexId iid, String searchterm, int offset, int limit) { |
| 145 | + readLocalization(iid); |
| 146 | + IndexId rel = iid.getRelated(); |
| 147 | + IndexId lin = iid.getLinks(); |
| 148 | + SearcherCache cache = SearcherCache.getInstance(); |
| 149 | + SearchResults res = new SearchResults(); |
| 150 | + try { |
| 151 | + IndexSearcherMul searcher = cache.getLocalSearcher(rel); |
| 152 | + IndexReader reader = searcher.getIndexReader(); |
| 153 | + String key = getKey(searchterm,iid); |
| 154 | + TermDocs td = reader.termDocs(new Term("key",key)); |
| 155 | + if(td.next()){ |
| 156 | + ArrayList<RelatedTitle> col = Related.convertToRelatedTitleList(new StringList(reader.document(td.doc()).get("related")).toCollection()); |
| 157 | + res.setNumHits(col.size()); |
| 158 | + res.setSuccess(true); |
| 159 | + // TODO: this is extremely slow |
| 160 | + Links links = Links.openForRead(lin,lin.getSearchPath()); |
| 161 | + for(int i=offset;i<offset+limit && i<col.size();i++){ |
| 162 | + RelatedTitle rt = col.get(i); |
| 163 | + Title t = rt.getRelated(); |
| 164 | + ResultSet rs = new ResultSet(rt.getScore(),t.getNamespaceAsString(),t.getTitle()); |
| 165 | + rs.addContext(links.getContext(t.getKey(),key)); |
| 166 | + res.addResult(rs); |
| 167 | + } |
| 168 | + } else{ |
| 169 | + res.setSuccess(true); |
| 170 | + res.setNumHits(0); |
| 171 | + } |
| 172 | + } catch (IOException e) { |
| 173 | + e.printStackTrace(); |
| 174 | + log.error("I/O error in relatedSearch on "+rel+" : "+e.getMessage()); |
| 175 | + res.setErrorMsg("I/O Error processing index for "+rel); |
| 176 | + } |
| 177 | + return res; |
| 178 | + } |
| 179 | + |
| 180 | + protected void readLocalization(IndexId iid){ |
| 181 | + if(!dbNamespaces.containsKey(iid.getDBname())){ |
| 182 | + synchronized(dbNamespaces){ |
| 183 | + HashMap<String,Integer> m = Localization.getLocalizedNamespaces(iid.getLangCode(),iid.getDBname()); |
| 184 | + Hashtable<String,Integer> map = new Hashtable<String,Integer>(); |
| 185 | + if(m != null) |
| 186 | + map.putAll(m); |
| 187 | + dbNamespaces.put(iid.getDBname(),map); |
| 188 | + } |
| 189 | + } |
| 190 | + } |
| 191 | + |
| 192 | + protected SearchResults prefixSearch(IndexId iid, String searchterm) { |
| 193 | + readLocalization(iid); |
117 | 194 | IndexId pre = iid.getPrefix(); |
118 | 195 | SearcherCache cache = SearcherCache.getInstance(); |
119 | 196 | SearchResults res = new SearchResults(); |
— | — | @@ -144,7 +221,8 @@ |
145 | 222 | } |
146 | 223 | } catch (IOException e) { |
147 | 224 | // res.setErrorMsg("Internal error during prefix search: "+e.getMessage()); |
148 | | - log.error("Internal error in SearchEngine::prefixSearch : "+e.getMessage()); |
| 225 | + log.error("Internal error in prefixSearch on "+pre+" : "+e.getMessage()); |
| 226 | + res.setErrorMsg("I/O error on index "+pre); |
149 | 227 | } |
150 | 228 | return res; |
151 | 229 | } |
— | — | @@ -166,9 +244,10 @@ |
167 | 245 | localfilter = null; |
168 | 246 | if(localfilter != null) |
169 | 247 | log.info("Using local filter: "+localfilter); |
170 | | - Hits hits = searcher.search(q,localfilter); |
| 248 | + TopDocs hits = searcher.search(q,localfilter,offset+limit); |
171 | 249 | return makeSearchResults(searcher,hits,offset,limit,iid,searchterm,q,searchStart,explain); |
172 | 250 | } catch (IOException e) { |
| 251 | + e.printStackTrace(); |
173 | 252 | SearchResults res = new SearchResults(); |
174 | 253 | res.setErrorMsg("Internal error in SearchEngine: "+e.getMessage()); |
175 | 254 | log.error("Internal error in SearchEngine while trying to search main part: "+e.getMessage()); |
— | — | @@ -186,7 +265,7 @@ |
187 | 266 | if(nsDefault == null || nsDefault.cardinality() == 0) |
188 | 267 | nsDefault = new NamespaceFilter("0"); // default to main namespace |
189 | 268 | FieldBuilder.Case dCase = exactCase? FieldBuilder.Case.EXACT_CASE : FieldBuilder.Case.IGNORE_CASE; |
190 | | - FieldBuilder.BuilderSet bs = new FieldBuilder(global.getLanguage(iid.getDBname()),dCase).getBuilder(dCase); |
| 269 | + FieldBuilder.BuilderSet bs = new FieldBuilder(iid,dCase).getBuilder(dCase); |
191 | 270 | ArrayList<String> stopWords = null; |
192 | 271 | try{ |
193 | 272 | stopWords = StopWords.getCached(iid); |
— | — | @@ -354,7 +433,8 @@ |
355 | 434 | |
356 | 435 | /** Our scores can span several orders of magnitude, transform them to be more relevant to the user */ |
357 | 436 | public float transformScore(double score){ |
358 | | - return (float) (Math.log10(1+score*99)/2); |
| 437 | + //return (float) (Math.log10(1+score*99)/2); |
| 438 | + return (float) score; |
359 | 439 | } |
360 | 440 | |
361 | 441 | protected SearchResults makeSearchResults(SearchableMul s, TopDocs hits, int offset, int limit, IndexId iid, String searchterm, Query q, long searchStart, boolean explain) throws IOException{ |
— | — | @@ -375,14 +455,15 @@ |
376 | 456 | // fetch documents |
377 | 457 | Document[] docs = s.docs(docids); |
378 | 458 | int j=0; |
379 | | - float maxScore = hits.getMaxScore(); |
| 459 | + //float maxScore = hits.getMaxScore(); |
| 460 | + float maxScore = 1; |
380 | 461 | for(Document doc : docs){ |
381 | 462 | String namespace = doc.get("namespace"); |
382 | 463 | String title = doc.get("title"); |
383 | 464 | float score = transformScore(scores[j]/maxScore); |
384 | 465 | ResultSet rs = new ResultSet(score,namespace,title); |
385 | 466 | if(explain) |
386 | | - rs.setExplanation(((WikiSearcher)s).explain(q,docids[j])); |
| 467 | + rs.setExplanation(((Searcher)s).explain(q,docids[j])); |
387 | 468 | res.addResult(rs); |
388 | 469 | j++; |
389 | 470 | } |
— | — | @@ -410,8 +491,8 @@ |
411 | 492 | Document[] docs = s.docs(docids); |
412 | 493 | int j=0; |
413 | 494 | float maxScore = 1; |
414 | | - if(numhits>0) |
415 | | - maxScore = hits.score(0); |
| 495 | + //if(numhits>0) |
| 496 | + // maxScore = hits.score(0); |
416 | 497 | for(Document doc : docs){ |
417 | 498 | String namespace = doc.get("namespace"); |
418 | 499 | String title = doc.get("title"); |
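
For reference, how getKey() canonicalizes titles into the "<namespaceId>:<title>" keys used by the related and links indexes; the results assume the English namespace map loaded by readLocalization():

    // getKey("User:Rainman", iid)   -> "2:Rainman"
    // getKey("Douglas Adams", iid)  -> "0:Douglas Adams"
    // getKey("NoSuchNs:Foo", iid)   -> "0:NoSuchNs:Foo" (unknown prefix falls back to main namespace)
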
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/RankValueSourceQuery.java |
— | — | @@ -0,0 +1,178 @@ |
| 2 | +package org.wikimedia.lsearch.search; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | +import java.util.Set; |
| 6 | + |
| 7 | +import org.apache.lucene.index.IndexReader; |
| 8 | +import org.apache.lucene.search.ComplexExplanation; |
| 9 | +import org.apache.lucene.search.Explanation; |
| 10 | +import org.apache.lucene.search.Query; |
| 11 | +import org.apache.lucene.search.Scorer; |
| 12 | +import org.apache.lucene.search.Searcher; |
| 13 | +import org.apache.lucene.search.Similarity; |
| 14 | +import org.apache.lucene.search.Weight; |
| 15 | +import org.apache.lucene.search.function.DocValues; |
| 16 | +import org.apache.lucene.search.function.ValueSource; |
| 17 | +import org.apache.lucene.search.function.ValueSourceQuery; |
| 18 | +import org.apache.lucene.util.ToStringUtils; |
| 19 | + |
| 20 | +public class RankValueSourceQuery extends ValueSourceQuery { |
| 21 | + protected ValueSource valSrc; |
| 22 | + /** |
| 23 | + * Create a value source query |
 | 24 | + * @param valSrc provides the values that define the function to be used for scoring 
| 25 | + */ |
| 26 | + public RankValueSourceQuery(ValueSource valSrc) { |
| 27 | + super(valSrc); |
| 28 | + this.valSrc = valSrc; |
| 29 | + } |
| 30 | + |
| 31 | + /*(non-Javadoc) @see org.apache.lucene.search.Query#rewrite(org.apache.lucene.index.IndexReader) */ |
| 32 | + public Query rewrite(IndexReader reader) throws IOException { |
| 33 | + return this; |
| 34 | + } |
| 35 | + |
| 36 | + /*(non-Javadoc) @see org.apache.lucene.search.Query#extractTerms(java.util.Set) */ |
| 37 | + public void extractTerms(Set terms) { |
| 38 | + // no terms involved here |
| 39 | + } |
| 40 | + |
| 41 | + private class ValueSourceWeight implements Weight { |
| 42 | + Searcher searcher; |
| 43 | + float queryNorm; |
| 44 | + float queryWeight; |
| 45 | + |
| 46 | + public ValueSourceWeight(Searcher searcher) { |
| 47 | + this.searcher = searcher; |
| 48 | + } |
| 49 | + |
| 50 | + /*(non-Javadoc) @see org.apache.lucene.search.Weight#getQuery() */ |
| 51 | + public Query getQuery() { |
| 52 | + return RankValueSourceQuery.this; |
| 53 | + } |
| 54 | + |
| 55 | + /*(non-Javadoc) @see org.apache.lucene.search.Weight#getValue() */ |
| 56 | + public float getValue() { |
| 57 | + return queryWeight; |
| 58 | + } |
| 59 | + |
| 60 | + /*(non-Javadoc) @see org.apache.lucene.search.Weight#sumOfSquaredWeights() */ |
| 61 | + public float sumOfSquaredWeights() throws IOException { |
| 62 | + queryWeight = getBoost(); |
| 63 | + return queryWeight * queryWeight; |
| 64 | + } |
| 65 | + |
| 66 | + /*(non-Javadoc) @see org.apache.lucene.search.Weight#normalize(float) */ |
| 67 | + public void normalize(float norm) { |
| 68 | + this.queryNorm = 1; |
| 69 | + queryWeight *= this.queryNorm; |
| 70 | + } |
| 71 | + |
| 72 | + /*(non-Javadoc) @see org.apache.lucene.search.Weight#scorer(org.apache.lucene.index.IndexReader) */ |
| 73 | + public Scorer scorer(IndexReader reader) throws IOException { |
| 74 | + return new ValueSourceScorer(getSimilarity(searcher), reader, this); |
| 75 | + } |
| 76 | + |
| 77 | + /*(non-Javadoc) @see org.apache.lucene.search.Weight#explain(org.apache.lucene.index.IndexReader, int) */ |
| 78 | + public Explanation explain(IndexReader reader, int doc) throws IOException { |
| 79 | + return scorer(reader).explain(doc); |
| 80 | + } |
| 81 | + } |
| 82 | + |
| 83 | + /** |
| 84 | + * A scorer that (simply) matches all documents, and scores each document with |
 | 85 | + * the value of the value source in effect. As an example, if the value source 
 | 86 | + * is a (cached) field source, then the value of that field in that document will 
 | 87 | + * be used (assuming the field is indexed for this doc, with a single token). 
| 88 | + */ |
| 89 | + private class ValueSourceScorer extends Scorer { |
| 90 | + private final IndexReader reader; |
| 91 | + private final ValueSourceWeight weight; |
| 92 | + private final int maxDoc; |
| 93 | + private final float qWeight; |
| 94 | + private int doc=-1; |
| 95 | + private final DocValues vals; |
| 96 | + |
| 97 | + // constructor |
| 98 | + private ValueSourceScorer(Similarity similarity, IndexReader reader, ValueSourceWeight w) throws IOException { |
| 99 | + super(similarity); |
| 100 | + this.weight = w; |
| 101 | + this.qWeight = w.getValue(); |
| 102 | + this.reader = reader; |
| 103 | + this.maxDoc = reader.maxDoc(); |
| 104 | + // this is when/where the values are first created. |
| 105 | + vals = valSrc.getValues(reader); |
| 106 | + } |
| 107 | + |
| 108 | + /*(non-Javadoc) @see org.apache.lucene.search.Scorer#next() */ |
| 109 | + public boolean next() throws IOException { |
| 110 | + for(;;) { |
| 111 | + ++doc; |
| 112 | + if (doc>=maxDoc) { |
| 113 | + return false; |
| 114 | + } |
| 115 | + if (reader.isDeleted(doc)) { |
| 116 | + continue; |
| 117 | + } |
| 118 | + return true; |
| 119 | + } |
| 120 | + } |
| 121 | + |
| 122 | + /*(non-Javadoc) @see org.apache.lucene.search.Scorer#doc() |
| 123 | + */ |
| 124 | + public int doc() { |
| 125 | + return doc; |
| 126 | + } |
| 127 | + |
| 128 | + /*(non-Javadoc) @see org.apache.lucene.search.Scorer#score() */ |
| 129 | + public float score() throws IOException { |
| 130 | + return qWeight * vals.floatVal(doc); |
| 131 | + } |
| 132 | + |
| 133 | + /*(non-Javadoc) @see org.apache.lucene.search.Scorer#skipTo(int) */ |
| 134 | + public boolean skipTo(int target) throws IOException { |
| 135 | + doc=target-1; |
| 136 | + return next(); |
| 137 | + } |
| 138 | + |
| 139 | + /*(non-Javadoc) @see org.apache.lucene.search.Scorer#explain(int) */ |
| 140 | + public Explanation explain(int doc) throws IOException { |
| 141 | + float sc = qWeight * vals.floatVal(doc); |
| 142 | + |
| 143 | + Explanation result = new ComplexExplanation( |
| 144 | + true, sc, RankValueSourceQuery.this.toString() + ", product of:"); |
| 145 | + |
| 146 | + result.addDetail(vals.explain(doc)); |
| 147 | + result.addDetail(new Explanation(getBoost(), "boost")); |
| 148 | + result.addDetail(new Explanation(weight.queryNorm,"queryNorm")); |
| 149 | + return result; |
| 150 | + } |
| 151 | + } |
| 152 | + |
| 153 | + /*(non-Javadoc) @see org.apache.lucene.search.Query#createWeight(org.apache.lucene.search.Searcher) */ |
| 154 | + protected Weight createWeight(Searcher searcher) { |
| 155 | + return new RankValueSourceQuery.ValueSourceWeight(searcher); |
| 156 | + } |
| 157 | + |
| 158 | + /* (non-Javadoc) @see org.apache.lucene.search.Query#toString(java.lang.String) */ |
| 159 | + public String toString(String field) { |
| 160 | + return valSrc.toString() + ToStringUtils.boost(getBoost()); |
| 161 | + } |
| 162 | + |
| 163 | + /** Returns true if <code>o</code> is equal to this. */ |
| 164 | + public boolean equals(Object o) { |
| 165 | + if (getClass() != o.getClass()) { |
| 166 | + return false; |
| 167 | + } |
| 168 | + RankValueSourceQuery other = (RankValueSourceQuery)o; |
| 169 | + return this.getBoost() == other.getBoost() |
| 170 | + && this.valSrc.equals(other.valSrc); |
| 171 | + } |
| 172 | + |
| 173 | + /** Returns a hash code value for this object. */ |
| 174 | + public int hashCode() { |
| 175 | + return (getClass().hashCode() + valSrc.hashCode()) ^ Float.floatToIntBits(getBoost()); |
| 176 | + } |
| 177 | + |
| 178 | + |
| 179 | +} |
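
Note that normalize() above deliberately ignores the norm it is passed and pins queryNorm to 1, so rank values keep their absolute scale rather than being rescaled with the rest of the query. A rank-only search is then simply (index path hypothetical):

    Query q = new RankValueSourceQuery(new RankValueSource());
    TopDocs top = new IndexSearcher("/search/indexes/wikilucene").search(q, null, 10);
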
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/RankDocValues.java |
— | — | @@ -0,0 +1,36 @@ |
| 2 | +package org.wikimedia.lsearch.search; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | + |
| 6 | +import org.apache.lucene.index.CorruptIndexException; |
| 7 | +import org.apache.lucene.index.IndexReader; |
| 8 | +import org.apache.lucene.index.TermDocs; |
| 9 | +import org.apache.lucene.search.function.DocValues; |
| 10 | + |
| 11 | +public class RankDocValues extends DocValues { |
| 12 | + IndexReader reader; |
| 13 | + |
| 14 | + public RankDocValues(IndexReader reader){ |
| 15 | + super(reader.maxDoc()); |
| 16 | + this.reader = reader; |
| 17 | + } |
| 18 | + |
| 19 | + protected int getValue(int doc){ |
| 20 | + try{ |
| 21 | + return Integer.parseInt(reader.document(doc).get("rank")); |
 | 22 | + } catch(Exception e){ // also covers a missing/malformed "rank" field (NumberFormatException) 
| 23 | + return 0; |
| 24 | + } |
| 25 | + } |
| 26 | + |
| 27 | + @Override |
| 28 | + public float floatVal(int doc) { |
| 29 | + return getValue(doc); |
| 30 | + } |
| 31 | + |
| 32 | + @Override |
| 33 | + public String toString(int doc) { |
| 34 | + return "rank: "+getValue(doc); |
| 35 | + } |
| 36 | + |
| 37 | +} |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/RankValueSource.java |
— | — | @@ -0,0 +1,34 @@ |
| 2 | +package org.wikimedia.lsearch.search; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | + |
| 6 | +import org.apache.lucene.index.IndexReader; |
| 7 | +import org.apache.lucene.search.function.DocValues; |
| 8 | +import org.apache.lucene.search.function.ValueSource; |
| 9 | + |
| 10 | +public class RankValueSource extends ValueSource { |
| 11 | + |
| 12 | + @Override |
| 13 | + public String description() { |
| 14 | + return ""; |
| 15 | + } |
| 16 | + |
| 17 | + @Override |
| 18 | + public boolean equals(Object o) { |
| 19 | + if(o == this) |
| 20 | + return true; |
| 21 | + else |
| 22 | + return false; |
| 23 | + } |
| 24 | + |
| 25 | + @Override |
| 26 | + public DocValues getValues(IndexReader reader) throws IOException { |
| 27 | + return new RankDocValues(reader); |
| 28 | + } |
| 29 | + |
| 30 | + @Override |
| 31 | + public int hashCode() { |
| 32 | + return 0; |
| 33 | + } |
| 34 | + |
| 35 | +} |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/UpdateThread.java |
— | — | @@ -27,6 +27,7 @@ |
28 | 28 | import org.wikimedia.lsearch.interoperability.RMIMessengerClient; |
29 | 29 | import org.wikimedia.lsearch.interoperability.RMIServer; |
30 | 30 | import org.wikimedia.lsearch.util.Command; |
| 31 | +import org.wikimedia.lsearch.util.FSUtils; |
31 | 32 | |
32 | 33 | |
33 | 34 | /** |
— | — | @@ -179,16 +180,16 @@ |
180 | 181 | try{ |
181 | 182 | // if local, use cp -lr instead of rsync |
182 | 183 | if(global.isLocalhost(iid.getIndexHost())){ |
183 | | - Command.exec("/bin/cp -lr "+iid.getSnapshotPath()+sep+li.timestamp+" "+iid.getUpdatePath()); |
| 184 | + FSUtils.createHardLinkRecursive( |
| 185 | + iid.getSnapshotPath()+sep+li.timestamp, |
| 186 | + updatepath); |
184 | 187 | } else{ |
185 | 188 | File ind = new File(iid.getCanonicalSearchPath()); |
186 | 189 | |
187 | 190 | if(ind.exists()){ // prepare a local hard-linked copy of index |
188 | | - ind = ind.getCanonicalFile(); |
189 | | - for(File f: ind.listFiles()){ |
190 | | - // a cp -lr command for each file in the index |
191 | | - Command.exec("/bin/cp -lr "+ind.getCanonicalPath()+sep+f.getName()+" "+updatepath+sep+f.getName()); |
192 | | - } |
| 191 | + FSUtils.createHardLinkRecursive( |
| 192 | + ind.getCanonicalPath(), |
| 193 | + updatepath); |
193 | 194 | } |
194 | 195 | long startTime = System.currentTimeMillis(); |
195 | 196 | // rsync |
— | — | @@ -208,8 +209,8 @@ |
209 | 210 | SearcherCache.SearcherPool pool = new SearcherCache.SearcherPool(iid,li.path,cache.getSearchPoolSize()); |
210 | 211 | |
211 | 212 | // refresh the symlink |
212 | | - Command.exec("/bin/rm -rf "+iid.getSearchPath()); |
213 | | - Command.exec("/bin/ln -fs "+updatepath+" "+iid.getSearchPath()); |
| 213 | + FSUtils.delete(iid.getSearchPath()); |
| 214 | + FSUtils.createSymLink(updatepath,iid.getSearchPath()); |
214 | 215 | |
215 | 216 | // update registry, cache, rmi object |
216 | 217 | registry.refreshUpdates(iid); |
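
FSUtils replaces the /bin/cp, /bin/rm and /bin/ln subprocess calls used before; the class itself is not part of this hunk, so the following is only a hypothetical sketch of its recursive hard-link helper (Java 5 has no hard-link API, so a plausible implementation still execs ln per file):

    // hypothetical sketch; the real FSUtils is not shown in this patch
    public static void createHardLinkRecursive(String src, String dst) throws IOException {
        File from = new File(src), to = new File(dst);
        if (from.isDirectory()) {
            to.mkdirs();
            for (File f : from.listFiles())
                createHardLinkRecursive(f.getPath(), new File(to, f.getName()).getPath());
        } else {
            try {
                Runtime.getRuntime().exec(new String[]{ "ln", src, dst }).waitFor();
            } catch (InterruptedException e) {
                throw new IOException("interrupted while hard-linking " + dst);
            }
        }
    }
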
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Warmup.java |
— | — | @@ -65,7 +65,7 @@ |
66 | 66 | /** Warmup index using some number of simple searches */ |
67 | 67 | protected static void warmupSearchTerms(IndexSearcherMul is, IndexId iid, int count, boolean useDelay) { |
68 | 68 | String lang = global.getLanguage(iid.getDBname()); |
69 | | - FieldBuilder.BuilderSet b = new FieldBuilder(lang).getBuilder(); |
| 69 | + FieldBuilder.BuilderSet b = new FieldBuilder(iid).getBuilder(); |
70 | 70 | WikiQueryParser parser = new WikiQueryParser(b.getFields().contents(),"0",Analyzers.getSearcherAnalyzer(iid,false),b,WikiQueryParser.NamespacePolicy.IGNORE,null); |
71 | 71 | Terms terms = getTermsForLang(lang); |
72 | 72 | |
— | — | @@ -122,7 +122,7 @@ |
123 | 123 | public static void simpleWarmup(IndexSearcherMul is, IndexId iid){ |
124 | 124 | try{ |
125 | 125 | String lang = global.getLanguage(iid.getDBname()); |
126 | | - FieldBuilder.BuilderSet b = new FieldBuilder(lang).getBuilder(); |
| 126 | + FieldBuilder.BuilderSet b = new FieldBuilder(iid).getBuilder(); |
127 | 127 | WikiQueryParser parser = new WikiQueryParser(b.getFields().contents(),"0",Analyzers.getSearcherAnalyzer(iid,false),b,WikiQueryParser.NamespacePolicy.IGNORE,null); |
128 | 128 | Query q = parser.parseFourPass("a OR very OR long OR title OR involving OR both OR wikipedia OR and OR pokemons",WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname()); |
129 | 129 | is.search(q,new NamespaceFilterWrapper(new NamespaceFilter("0"))); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java |
— | — | @@ -27,6 +27,7 @@ |
28 | 28 | import org.apache.lucene.store.Directory; |
29 | 29 | import org.apache.lucene.store.FSDirectory; |
30 | 30 | import org.wikimedia.lsearch.analyzers.Analyzers; |
| 31 | +import org.wikimedia.lsearch.analyzers.ContextAnalyzer; |
31 | 32 | import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine; |
32 | 33 | import org.wikimedia.lsearch.analyzers.FieldBuilder; |
33 | 34 | import org.wikimedia.lsearch.analyzers.FieldNameFactory; |
— | — | @@ -41,6 +42,7 @@ |
42 | 43 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
43 | 44 | import org.wikimedia.lsearch.config.IndexId; |
44 | 45 | import org.wikimedia.lsearch.interoperability.RMIMessengerClient; |
| 46 | +import org.wikimedia.lsearch.ranks.Links; |
45 | 47 | import org.wikimedia.lsearch.related.RelatedTitle; |
46 | 48 | import org.wikimedia.lsearch.spell.api.SpellCheckIndexer; |
47 | 49 | import org.wikimedia.lsearch.util.Localization; |
— | — | @@ -169,7 +171,15 @@ |
170 | 172 | writer.setUseCompoundFile(true); |
171 | 173 | writer.setMaxFieldLength(MAX_FIELD_LENGTH); |
172 | 174 | FieldBuilder.Case dCase = (exactCase)? FieldBuilder.Case.EXACT_CASE : FieldBuilder.Case.IGNORE_CASE; |
173 | | - FieldBuilder builder = new FieldBuilder(langCode,dCase); |
| 175 | + FieldBuilder builder = new FieldBuilder(iid,dCase); |
| 176 | + // TODO: fixme |
| 177 | + Links links = null; |
| 178 | + try { |
| 179 | + links = Links.openForRead(iid,iid.getImportPath()); |
| 180 | + } catch (IOException e1) { |
 | 181 | + // indexing proceeds with links == null if the links index cannot be opened 
| 182 | + e1.printStackTrace(); |
| 183 | + } |
174 | 184 | |
175 | 185 | for(IndexUpdateRecord rec : records){ |
176 | 186 | if(rec.doAdd()){ |
— | — | @@ -178,7 +188,7 @@ |
179 | 189 | if(!checkPreconditions(rec)) |
180 | 190 | continue; // article shouldn't be added for some reason |
181 | 191 | IndexReportCard card = getReportCard(rec); |
182 | | - Object[] ret = makeDocumentAndAnalyzer(rec.getArticle(),builder,iid); |
| 192 | + Object[] ret = makeDocumentAndAnalyzer(rec.getArticle(),builder,iid,links); |
183 | 193 | Document doc = (Document) ret[0]; |
184 | 194 | Analyzer analyzer = (Analyzer) ret[1]; |
185 | 195 | try { |
— | — | @@ -400,9 +410,8 @@ |
401 | 411 | * @param languageAnalyzer |
402 | 412 | * @return array { document, analyzer } |
403 | 413 | */ |
404 | | - public static Object[] makeDocumentAndAnalyzer(Article article, FieldBuilder builder, IndexId iid){ |
| 414 | + public static Object[] makeDocumentAndAnalyzer(Article article, FieldBuilder builder, IndexId iid, Links links){ |
405 | 415 | PerFieldAnalyzerWrapper perFieldAnalyzer = null; |
406 | | - WikiTokenizer tokenizer = null; |
407 | 416 | Document doc = new Document(); |
408 | 417 | |
409 | 418 | // tranform record so that unnecessary stuff is deleted, e.g. some redirects |
— | — | @@ -463,8 +472,10 @@ |
464 | 473 | doc.add(contents); |
465 | 474 | |
466 | 475 | // related articles |
467 | | - p = makeRelated(doc,fields.related(),article,1); |
| 476 | + p = makeRelated(doc,fields.related(),article,1,fields.context()); |
468 | 477 | |
| 478 | + //makeContextField(doc,fields.context(),fields.related()); |
| 479 | + |
469 | 480 | // anchors |
470 | 481 | // makeKeywordField(doc,fields.anchor(),rankBoost); |
471 | 482 | |
— | — | @@ -479,7 +490,7 @@ |
480 | 491 | } |
481 | 492 | // make analyzer |
482 | 493 | String text = article.getContents(); |
483 | | - Object[] ret = Analyzers.getIndexerAnalyzer(text,builder,article.getRedirectKeywords(),article.getAnchorText(),article.getRelated(),p); |
| 494 | + Object[] ret = Analyzers.getIndexerAnalyzer(text,builder,article.getRedirectKeywords(),article.getAnchorText(),article.getRelated(),p,article.makeTitle(),links); |
484 | 495 | perFieldAnalyzer = (PerFieldAnalyzerWrapper) ret[0]; |
485 | 496 | |
486 | 497 | |
— | — | @@ -487,7 +498,7 @@ |
488 | 499 | } |
489 | 500 | |
490 | 501 | /** Returns partioning of related titles, or null if there aren't any */ |
491 | | - protected static int[] makeRelated(Document doc, String prefix, Article article, float boost) { |
| 502 | + protected static int[] makeRelated(Document doc, String prefix, Article article, float boost, String context) { |
492 | 503 | ArrayList<RelatedTitle> rel = article.getRelated(); |
493 | 504 | if(rel == null || rel.size()==0) |
494 | 505 | return null; |
— | — | @@ -501,14 +512,32 @@ |
502 | 513 | for(int i=1;i<RelatedAnalyzer.RELATED_GROUPS;i++){ |
503 | 514 | Field relfield = new Field(prefix+i, "", |
504 | 515 | Field.Store.NO, Field.Index.TOKENIZED); |
505 | | - relfield.setBoost(boost*(float)MathFunc.avg(scores,p[i-1],p[i])); |
| 516 | + float fb = boost*(float)MathFunc.avg(scores,p[i-1],p[i]); |
| 517 | + relfield.setBoost(fb); |
506 | 518 | doc.add(relfield); |
| 519 | + if(i <= ContextAnalyzer.CONTEXT_GROUPS){ |
| 520 | + Field confield = new Field(context+i, "", |
| 521 | + Field.Store.NO, Field.Index.TOKENIZED); |
| 522 | + confield.setBoost(fb); // use same boost as related field |
| 523 | + doc.add(confield); |
| 524 | + } |
507 | 525 | } |
508 | 526 | |
509 | 527 | return p; |
510 | 528 | } |
511 | 529 | |
512 | | - /** Make a multiple keyword field, e.g. redirect1, redirect2, redirect3 ... */ |
| 530 | + /** Make a multiple context field ... */ |
| 531 | + protected static void makeContextField(Document doc, String prefix, String related) { |
| 532 | + for(int i=1;i<=ContextAnalyzer.CONTEXT_GROUPS;i++){ |
| 533 | + Field keyfield = new Field(prefix+i, "", |
| 534 | + Field.Store.NO, Field.Index.TOKENIZED); |
| 535 | + keyfield.setBoost(doc.getField(related+i).getBoost()); // use same boost as related field |
| 536 | + doc.add(keyfield); |
| 537 | + } |
| 538 | + |
| 539 | + } |
| 540 | + |
| 541 | + /** Make multiple keyword fields, e.g. keyword1, keyword2, keyword3 ... */ |
513 | 542 | protected static void makeKeywordField(Document doc, String prefix, float boost) { |
514 | 543 | for(int i=1;i<=KeywordsAnalyzer.KEYWORD_LEVELS;i++){ |
515 | 544 | Field keyfield = new Field(prefix+i, "", |
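The per-group boost computed in makeRelated above is the average relatedness score of the titles in that partition, and the same boost is now mirrored onto the matching context field. A minimal standalone sketch of the computation, assuming MathFunc.avg is an arithmetic mean over the half-open index range [from,to) (the scores and boundaries below are hypothetical):

    // hypothetical relatedness scores and partition boundaries
    double[] scores = {9.0, 7.5, 4.0, 2.0, 0.5};
    int[] p = {0, 2, 4, 5};
    float boost = 1.0f; // base boost from the caller
    for (int i = 1; i < p.length; i++) {
        double sum = 0;
        for (int j = p[i - 1]; j < p[i]; j++)
            sum += scores[j];
        float fb = boost * (float) (sum / (p[i] - p[i - 1]));
        System.out.println("related" + i + " boost = " + fb);
        // the matching context<i> field (for i <= CONTEXT_GROUPS) reuses fb
    }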
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiSimilarity.java |
— | — | @@ -39,7 +39,7 @@ |
40 | 40 | float f = (float) (1.0 / (Math.sqrt(numTokens) * numTokens)); |
41 | 41 | //log.debug("Length-norm: "+f+", numtokens: "+numTokens); |
42 | 42 | return f; |
43 | | - } else if(fieldName.startsWith("redirect") || fieldName.startsWith("keyword") || fieldName.startsWith("related") || fieldName.startsWith("anchor")){ |
| 43 | + } else if(fieldName.startsWith("redirect") || fieldName.startsWith("keyword") || fieldName.startsWith("related") || fieldName.startsWith("anchor") || fieldName.startsWith("context")){ |
44 | 44 | return 1; |
45 | 45 | } else |
46 | 46 | return super.lengthNorm(fieldName,numTokens); |
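With the added clause, the new context fields join the other keyword-style fields in having length normalization disabled, so a document with few contexts is not scored above one with many. A minimal sketch of the same pattern against the Lucene 2.x Similarity API (not the full WikiSimilarity):

    import org.apache.lucene.search.DefaultSimilarity;

    public class FlatNormSimilarity extends DefaultSimilarity {
        public float lengthNorm(String fieldName, int numTokens) {
            // flat norm: field length does not affect the score
            if (fieldName.startsWith("context"))
                return 1;
            return super.lengthNorm(fieldName, numTokens);
        }
    }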
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/IndexThread.java |
— | — | @@ -38,6 +38,7 @@ |
39 | 39 | import org.wikimedia.lsearch.config.IndexRegistry; |
40 | 40 | import org.wikimedia.lsearch.interoperability.RMIMessengerClient; |
41 | 41 | import org.wikimedia.lsearch.util.Command; |
| 42 | +import org.wikimedia.lsearch.util.FSUtils; |
42 | 43 | |
43 | 44 | /** |
44 | 45 | * Indexer. |
— | — | @@ -235,20 +236,6 @@ |
236 | 237 | } |
237 | 238 | } |
238 | 239 | |
239 | | - protected static void deleteDirRecursive(File file){ |
240 | | - if(!file.exists()) |
241 | | - return; |
242 | | - else if(file.isDirectory()){ |
243 | | - File[] files = file.listFiles(); |
244 | | - for(File f: files) |
245 | | - deleteDirRecursive(f); |
246 | | - file.delete(); |
247 | | - log.debug("Deleted old snapshot at "+file); |
248 | | - } else{ |
249 | | - file.delete(); |
250 | | - } |
251 | | - } |
252 | | - |
253 | 240 | /** |
254 | 241 | * Make a snapshot of all changed indexes |
255 | 242 | * |
— | — | @@ -296,20 +283,27 @@ |
297 | 284 | File[] files = spd.listFiles(); |
298 | 285 | for(File f: files){ |
299 | 286 | if(!f.getAbsolutePath().equals(li.path)) // leave the last snapshot |
300 | | - deleteDirRecursive(f); |
| 287 | + FSUtils.deleteRecursive(f); |
301 | 288 | } |
302 | 289 | } |
303 | 290 | new File(snapshot).mkdirs(); |
| 291 | + try { |
| 292 | + FSUtils.createHardLinkRecursive(indexPath,snapshot); |
| 293 | + } catch (IOException e) { |
| 294 | + log.error("Error making snapshot "+snapshot+": "+e.getMessage()); |
| 295 | + return; |
| 296 | + } |
| 297 | + /* |
304 | 298 | File ind =new File(indexPath); |
305 | 299 | for(File f: ind.listFiles()){ |
306 | | - // use a cp -lr command for each file in the index |
| 300 | + // hardlink the snapshot |
307 | 301 | try { |
308 | 302 | Command.exec("/bin/cp -lr "+indexPath+sep+f.getName()+" "+snapshot+sep+f.getName()); |
309 | 303 | } catch (IOException e) { |
310 | 304 | log.error("Error making snapshot "+snapshot+": "+e.getMessage()); |
311 | 305 | continue; |
312 | 306 | } |
313 | | - } |
| 307 | + } */ |
314 | 308 | log.info("Made snapshot "+snapshot); |
315 | 309 | } |
316 | 310 | |
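The snapshot is now built by hard-linking the whole index tree through FSUtils rather than shelling out to cp -lr once per file; since Lucene segment files are write-once, hard links yield a consistent snapshot at near-zero cost. A usage sketch (the paths are hypothetical):

    import java.io.File;
    import java.io.IOException;
    import org.wikimedia.lsearch.util.FSUtils;

    String indexPath = "/var/lib/lsearch/indexes/wikilucene"; // hypothetical
    String snapshot  = "/var/lib/lsearch/snapshots/wikilucene/20071015"; // hypothetical
    new File(snapshot).mkdirs();
    try {
        FSUtils.createHardLinkRecursive(indexPath, snapshot);
    } catch (IOException e) {
        // snapshot failed; the previous snapshot stays in place
    }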
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/HighlightDaemon.java |
— | — | @@ -128,7 +128,7 @@ |
129 | 129 | FieldBuilder.Case dCase = exactCase? FieldBuilder.Case.EXACT_CASE : FieldBuilder.Case.IGNORE_CASE; |
130 | 130 | String lang = global.getLanguage(dbname); |
131 | 131 | Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid,exactCase); |
132 | | - FieldBuilder.BuilderSet bs = new FieldBuilder(lang,dCase).getBuilder(dCase); |
| 132 | + FieldBuilder.BuilderSet bs = new FieldBuilder(iid,dCase).getBuilder(dCase); |
133 | 133 | WikiQueryParser parser = new WikiQueryParser(bs.getFields().contents(), |
134 | 134 | new NamespaceFilter("0"),analyzer,bs,WikiQueryParser.NamespacePolicy.IGNORE,null); |
135 | 135 | Query q = parser.parseFourPass(query,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname()); |
— | — | @@ -139,7 +139,7 @@ |
140 | 140 | |
141 | 141 | for(Article ar : articles){ |
142 | 142 | log.debug("Sending highlighted text for "+ar); |
143 | | - String clean = new CleanupParser(ar.getContents(),lang).parse(); |
| 143 | + String clean = new CleanupParser(ar.getContents(),iid).parse(); |
144 | 144 | TokenStream tokens = analyzer.tokenStream("contents",clean); |
145 | 145 | out.println("HIGHLIGHTING "+ar.getNamespace()+" "+ar.getTitle()); |
146 | 146 | String[] highlighted = highlighter.getBestFragments(tokens,clean,segments); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/CleanupParser.java |
— | — | @@ -3,6 +3,7 @@ |
4 | 4 | import java.util.HashSet; |
5 | 5 | import java.util.Hashtable; |
6 | 6 | |
| 7 | +import org.wikimedia.lsearch.config.IndexId; |
7 | 8 | import org.wikimedia.lsearch.util.Localization; |
8 | 9 | |
9 | 10 | /** |
— | — | @@ -34,6 +35,7 @@ |
35 | 36 | |
36 | 37 | /** language code */ |
37 | 38 | private String language; |
| 39 | + private IndexId iid; |
38 | 40 | /** language code -> set (image namespace names) */ |
39 | 41 | private static Hashtable<String,HashSet<String>> imageLocalized = new Hashtable<String,HashSet<String>>(); |
40 | 42 | /** language code -> set (category namespace names) */ |
— | — | @@ -47,10 +49,11 @@ |
48 | 50 | |
49 | 51 | enum FetchState { WORD, CATEGORY, INTERWIKI, KEYWORD }; |
50 | 52 | |
51 | | - public CleanupParser(String text, String lang){ |
| 53 | + public CleanupParser(String text, IndexId iid){ |
52 | 54 | this.text = text.toCharArray(); |
53 | 55 | this.textString = text; |
54 | | - this.language = lang; |
| 56 | + this.iid = iid; |
| 57 | + this.language = iid.getLangCode(); |
55 | 58 | textLength = text.length(); |
56 | 59 | out = new char[textLength]; |
57 | 60 | } |
— | — | @@ -409,7 +412,7 @@ |
410 | 413 | else if(language!=null && language.length()!=0){ |
411 | 414 | HashSet<String> loc = imageLocalized.get(language); |
412 | 415 | if(loc == null){ |
413 | | - loc = Localization.getLocalizedImage(language); |
| 416 | + loc = Localization.getLocalizedImage(language,iid.getDBname()); |
414 | 417 | imageLocalized.put(language,loc); |
415 | 418 | } |
416 | 419 | if(loc.contains(prefix)) |
— | — | @@ -426,7 +429,7 @@ |
427 | 430 | else if(language!=null && language.length()!=0){ |
428 | 431 | HashSet<String> loc = categoryLocalized.get(language); |
429 | 432 | if(loc == null){ |
430 | | - loc = Localization.getLocalizedCategory(language); |
| 433 | + loc = Localization.getLocalizedCategory(language,iid.getDBname()); |
431 | 434 | categoryLocalized.put(language,loc); |
432 | 435 | } |
433 | 436 | if(loc.contains(prefix)) |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java |
— | — | @@ -83,7 +83,7 @@ |
84 | 84 | Iterator it = info.Namespaces.orderedEntries(); |
85 | 85 | while(it.hasNext()){ |
86 | 86 | Entry<Integer,String> pair = (Entry<Integer,String>)it.next(); |
87 | | - Localization.addCustomMapping(pair.getValue(),pair.getKey(),langCode); |
| 87 | + Localization.addCustomMapping(pair.getValue(),pair.getKey(),iid.getDBname()); |
88 | 88 | } |
89 | 89 | } |
90 | 90 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java |
— | — | @@ -16,11 +16,17 @@ |
17 | 17 | import org.apache.lucene.queryParser.ParseException; |
18 | 18 | import org.apache.lucene.search.BooleanClause; |
19 | 19 | import org.apache.lucene.search.BooleanQuery; |
| 20 | +import org.apache.lucene.search.CustomBoostQuery; |
| 21 | +import org.apache.lucene.search.Explanation; |
20 | 22 | import org.apache.lucene.search.PhraseQuery; |
21 | 23 | import org.apache.lucene.search.Query; |
22 | 24 | import org.apache.lucene.search.TermQuery; |
23 | 25 | import org.apache.lucene.search.WildcardQuery; |
24 | 26 | import org.apache.lucene.search.BooleanClause.Occur; |
| 27 | +import org.apache.lucene.search.function.CustomScoreQuery; |
| 28 | +import org.apache.lucene.search.function.FieldScoreQuery; |
| 29 | +import org.apache.lucene.search.function.ValueSource; |
| 30 | +import org.apache.lucene.search.function.ValueSourceQuery; |
25 | 31 | import org.apache.lucene.search.spans.SpanNearQuery; |
26 | 32 | import org.apache.lucene.search.spans.SpanQuery; |
27 | 33 | import org.apache.lucene.search.spans.SpanTermQuery; |
— | — | @@ -28,6 +34,8 @@ |
29 | 35 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
30 | 36 | import org.wikimedia.lsearch.index.WikiIndexModifier; |
31 | 37 | import org.wikimedia.lsearch.search.NamespaceFilter; |
| 38 | +import org.wikimedia.lsearch.search.RankValueSource; |
| 39 | +import org.wikimedia.lsearch.search.RankValueSourceQuery; |
32 | 40 | import org.wikimedia.lsearch.util.UnicodeDecomposer; |
33 | 41 | |
34 | 42 | /** |
— | — | @@ -88,12 +96,13 @@ |
89 | 97 | public static float KEYWORD_BOOST = 0.02f; |
90 | 98 | public static float CONTENTS_BOOST = 0.2f; |
91 | 99 | |
92 | | - public static int ADDITIONAL_PHRASE_SLOP_CONTENTS = 20; |
| 100 | + public static int ADDITIONAL_PHRASE_SLOP_CONTENTS = 5000; |
93 | 101 | public static float ADDITIONAL_BOOST_CONTENTS = 0.5f; |
94 | | - public static int ADDITIONAL_PHRASE_SLOP_TITLE = 1; |
| 102 | + public static int ADDITIONAL_PHRASE_SLOP_TITLE = 0; |
95 | 103 | public static float ADDITIONAL_BOOST_TITLE = 0.5f; |
96 | | - public static int ADDITIONAL_PHRASE_SLOP_RELATED = 10; |
97 | | - public static float ADDITIONAL_BOOST_RELATED = 0.04f; |
| 104 | + public static int ADDITIONAL_PHRASE_SLOP_RELATED = 0; |
| 105 | + public static float ADDITIONAL_BOOST_RELATED = 0.2f; |
| 106 | + public static float ADDITIONAL_BOOST_CONTEXT = 0.05f; |
98 | 107 | |
99 | 108 | public static float WHOLE_TITLE_BOOST = 8f; |
100 | 109 | public static float EXACT_CONTENTS_BOOST = 1f; |
— | — | @@ -1422,11 +1431,30 @@ |
1423 | 1432 | pq.setSlop(slop); |
1424 | 1433 | return pq; |
1425 | 1434 | } |
1426 | | - |
| 1435 | + |
1427 | 1436 | /** Make phrase queries for additional scores */ |
1428 | 1437 | public Query makePhraseQueries(ArrayList<String> words, String field, int slop, float boost){ |
1429 | 1438 | if(words.size() <= 1) |
1430 | 1439 | return null; |
| 1440 | + else{ |
| 1441 | + PhraseQuery pq = new PhraseQuery(); |
| 1442 | + for(String w : words){ |
| 1443 | + if(!stopWords.contains(w)) |
| 1444 | + pq.add(new Term(field,w)); |
| 1445 | + } |
| 1446 | + pq.setSlop(slop); |
| 1447 | + pq.setBoost(boost); |
| 1448 | + return pq; |
| 1449 | + } |
| 1450 | + |
| 1451 | + } |
| 1452 | + |
| 1453 | + |
| 1454 | + /** Make phrase queries for additional scores */ |
| 1455 | + @Deprecated |
| 1456 | + public Query makePhraseQueriesOld(ArrayList<String> words, String field, int slop, float boost){ |
| 1457 | + if(words.size() <= 1) |
| 1458 | + return null; |
1431 | 1459 | else if(words.size() == 2){ |
1432 | 1460 | PhraseQuery pq = makePhrase(words,field,slop); |
1433 | 1461 | pq.setBoost(boost); |
— | — | @@ -1550,26 +1578,73 @@ |
1551 | 1579 | // skip last related group |
1552 | 1580 | Query[] pqr = new Query[RelatedAnalyzer.RELATED_GROUPS-1]; |
1553 | 1581 | for(int i=1;i<RelatedAnalyzer.RELATED_GROUPS;i++){ |
1554 | | - pqr[i-1] = makePhraseQueries(words,"related"+i,ADDITIONAL_PHRASE_SLOP_RELATED,ADDITIONAL_BOOST_RELATED); |
| 1582 | + pqr[i-1] = makePhraseQueries(words,fields.related()+i,ADDITIONAL_PHRASE_SLOP_RELATED,ADDITIONAL_BOOST_RELATED); |
1555 | 1583 | } |
1556 | 1584 | Query[] wqr = new Query[RelatedAnalyzer.RELATED_GROUPS-1]; |
1557 | 1585 | for(int i=1;i<RelatedAnalyzer.RELATED_GROUPS;i++){ |
1558 | | - wqr[i-1] = makeWordQueries(words,"related"+i,ADDITIONAL_BOOST_RELATED / 4); |
| 1586 | + wqr[i-1] = makeWordQueries(words,fields.related()+i,ADDITIONAL_BOOST_RELATED / 4); |
1559 | 1587 | } |
| 1588 | + Query[] pqx = new Query[ContextAnalyzer.CONTEXT_GROUPS]; |
| 1589 | + // make context queries |
| 1590 | + for(int i=1;i<=ContextAnalyzer.CONTEXT_GROUPS;i++){ |
| 1591 | + pqx[i-1] = makePhraseQueries(words,fields.context()+i,0,ADDITIONAL_BOOST_CONTEXT); |
| 1592 | + } |
1560 | 1593 | if(wt==null && pqc == null && pqt == null && pqr[0] == null && wqr[0] == null) |
1561 | 1594 | return bq; |
1562 | 1595 | // build the final query |
1563 | | - BooleanQuery finalQuery = new BooleanQuery(true); |
| 1596 | + BooleanQuery coreQuery = new BooleanQuery(true); |
1564 | 1597 | BooleanQuery additional = new BooleanQuery(true); |
| 1598 | + //BooleanQuery boostQuery = new BooleanQuery(true); |
1565 | 1599 | |
1566 | | - if(pqc != null) |
1567 | | - additional.add(pqc,Occur.MUST); |
| 1600 | + if(pqc != null){ |
| 1601 | + //additional.add(pqc,Occur.MUST); |
| 1602 | + additional.add(new CustomScoreQuery(pqc, new RankValueSourceQuery(new RankValueSource())){ |
| 1603 | + public float customScore(int doc, float subQueryScore, float valSrcScore) { |
| 1604 | + return (float) (subQueryScore * Math.log(Math.E+valSrcScore/15)); |
| 1605 | + } |
| 1606 | + public Explanation customExplain(int doc, Explanation subQueryExpl, Explanation valSrcExpl) { |
| 1607 | + float valSrcScore = valSrcExpl==null ? 1 : valSrcExpl.getValue(); |
| 1608 | + Explanation exp = new Explanation( (float)Math.log(Math.E+valSrcScore/15) * subQueryExpl.getValue(), ": "+valSrcScore+" "+(float)Math.log(Math.E+valSrcScore/15)+"*"+subQueryExpl.getValue()+" custom score: product of:"); |
| 1609 | + exp.addDetail(subQueryExpl); |
| 1610 | + if (valSrcExpl != null) { |
| 1611 | + exp.addDetail(valSrcExpl); |
| 1612 | + } |
| 1613 | + return exp; |
| 1614 | + } |
| 1615 | + },Occur.MUST); |
| 1616 | + } |
1568 | 1617 | if(pqt != null) |
1569 | 1618 | additional.add(pqt,Occur.SHOULD); |
1570 | 1619 | if(wt != null) |
1571 | 1620 | additional.add(wt,Occur.SHOULD); |
1572 | | - if(wc != null) |
1573 | | - additional.add(wc,Occur.SHOULD); |
| 1621 | + if(wc != null){ |
| 1622 | + // additional.add(wc,Occur.SHOULD); |
| 1623 | + BooleanQuery boostExact = new BooleanQuery(); |
| 1624 | + for(Query q : pqr){ |
| 1625 | + if(q != null) |
| 1626 | + boostExact.add(q,Occur.SHOULD); |
| 1627 | + } |
| 1628 | + for(Query q : wqr){ |
| 1629 | + if(q != null) |
| 1630 | + boostExact.add(q,Occur.SHOULD); |
| 1631 | + } |
| 1632 | + CustomBoostQuery cbq = new CustomBoostQuery(wc,boostExact); |
| 1633 | + /*CustomScoreQuery csq = new CustomScoreQuery(cbq, new RankValueSourceQuery(new RankValueSource())) { |
| 1634 | + public float customScore(int doc, float subQueryScore, float valSrcScore) { |
| 1635 | + return (float) (subQueryScore * Math.log10(10+valSrcScore)); |
| 1636 | + } |
| 1637 | + public Explanation customExplain(int doc, Explanation subQueryExpl, Explanation valSrcExpl) { |
| 1638 | + float valSrcScore = valSrcExpl==null ? 1 : valSrcExpl.getValue(); |
| 1639 | + Explanation exp = new Explanation( (float)Math.log10(10+valSrcScore) * subQueryExpl.getValue(), "custom score: product of:"); |
| 1640 | + exp.addDetail(subQueryExpl); |
| 1641 | + if (valSrcExpl != null) { |
| 1642 | + exp.addDetail(valSrcExpl); |
| 1643 | + } |
| 1644 | + return exp; |
| 1645 | + } |
| 1646 | + }; */ |
| 1647 | + additional.add(cbq,Occur.SHOULD); |
| 1648 | + } |
1574 | 1649 | for(Query q : pqr){ |
1575 | 1650 | if(q != null) |
1576 | 1651 | additional.add(q,Occur.SHOULD); |
— | — | @@ -1578,16 +1653,21 @@ |
1579 | 1654 | if(q != null) |
1580 | 1655 | additional.add(q,Occur.SHOULD); |
1581 | 1656 | } |
| 1657 | + /*for(Query q : pqx){ |
| 1658 | + if(q != null) |
| 1659 | + additional.add(q,Occur.SHOULD); |
| 1660 | + } */ |
1582 | 1661 | |
1583 | 1662 | // anchors |
1584 | 1663 | //Query anchors = multiplySpans(nostem,0,fields.anchor(),ANCHOR_BOOST); |
1585 | 1664 | |
1586 | | - finalQuery.add(bq,Occur.MUST); |
1587 | | - finalQuery.add(additional,Occur.SHOULD); |
| 1665 | + coreQuery.add(bq,Occur.MUST); |
| 1666 | + coreQuery.add(additional,Occur.SHOULD); |
1588 | 1667 | //if(anchors != null) |
1589 | 1668 | // finalQuery.add(anchors,Occur.SHOULD); |
1590 | 1669 | |
1591 | | - return finalQuery; |
| 1670 | + return coreQuery; |
| 1671 | + //return new CustomBoostQuery(coreQuery,boostQuery); |
1592 | 1672 | |
1593 | 1673 | } |
1594 | 1674 | |
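The CustomScoreQuery introduced above multiplies the exact-phrase score by log(e + rank/15), a sub-linear curve that favors highly linked articles without letting rank drown out textual relevance. Evaluating the factor for a few sample rank values:

    for (int rank : new int[]{0, 15, 150, 1500}) {
        double factor = Math.log(Math.E + rank / 15.0);
        System.out.printf("rank=%d -> x%.2f%n", rank, factor);
    }
    // rank=0 -> x1.00, rank=15 -> x1.31, rank=150 -> x2.54, rank=1500 -> x4.63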
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Analyzers.java |
— | — | @@ -15,9 +15,11 @@ |
16 | 16 | import org.apache.lucene.analysis.ru.RussianStemFilter; |
17 | 17 | import org.apache.lucene.analysis.th.ThaiWordFilter; |
18 | 18 | import org.apache.lucene.search.FieldSortedHitQueue; |
| 19 | +import org.wikimedia.lsearch.beans.Title; |
19 | 20 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
20 | 21 | import org.wikimedia.lsearch.config.IndexId; |
21 | 22 | import org.wikimedia.lsearch.index.WikiIndexModifier; |
| 23 | +import org.wikimedia.lsearch.ranks.Links; |
22 | 24 | import org.wikimedia.lsearch.related.RelatedTitle; |
23 | 25 | import org.wikimedia.lsearch.test.AliasPorterStemFilter; |
24 | 26 | |
— | — | @@ -54,12 +56,13 @@ |
55 | 57 | * @param languageAnalyzer language filter class (e.g. PorterStemFilter) |
56 | 58 | * @return {PerFieldAnalyzerWrapper,WikiTokenizer} |
57 | 59 | */ |
58 | | - public static Object[] getIndexerAnalyzer(String text, FieldBuilder builder, ArrayList<String> redirects, ArrayList<String> anchors, ArrayList<RelatedTitle> related, int[] relatedPartition) { |
| 60 | + public static Object[] getIndexerAnalyzer(String text, FieldBuilder builder, ArrayList<String> redirects, ArrayList<String> anchors, |
| 61 | + ArrayList<RelatedTitle> related, int[] relatedPartition, Title title, Links links) { |
59 | 62 | PerFieldAnalyzerWrapper perFieldAnalyzer = new PerFieldAnalyzerWrapper(new SimpleAnalyzer()); |
60 | 63 | WikiTokenizer tokenizer = null; |
61 | 64 | for(FieldBuilder.BuilderSet bs : builder.getBuilders()){ |
62 | 65 | tokenizer = addFieldsForIndexing(perFieldAnalyzer, text, bs.getFilters(), bs.getFields(), |
63 | | - redirects, anchors, related, relatedPartition, bs.isExactCase(), bs.isAddKeywords()); |
| 66 | + redirects, anchors, related, relatedPartition, title, links, bs.isExactCase(), bs.isAddKeywords()); |
64 | 67 | } |
65 | 68 | return new Object[] {perFieldAnalyzer,tokenizer}; |
66 | 69 | } |
— | — | @@ -70,9 +73,9 @@ |
71 | 74 | */ |
72 | 75 | public static WikiTokenizer addFieldsForIndexing(PerFieldAnalyzerWrapper perFieldAnalyzer, String text, |
73 | 76 | FilterFactory filters, FieldNameFactory fields, ArrayList<String> redirects, ArrayList<String> anchors, |
74 | | - ArrayList<RelatedTitle> related, int[] relatedPartition, boolean exactCase, boolean addKeywords) { |
| 77 | + ArrayList<RelatedTitle> related, int[] relatedPartition, Title title, Links links, boolean exactCase, boolean addKeywords) { |
75 | 78 | // parse wiki-text to get categories |
76 | | - WikiTokenizer tokenizer = new WikiTokenizer(text,filters.getLanguage(),exactCase); |
| 79 | + WikiTokenizer tokenizer = new WikiTokenizer(text,filters.getIndexId(),exactCase); |
77 | 80 | tokenizer.tokenize(); |
78 | 81 | ArrayList<String> categories = tokenizer.getCategories(); |
79 | 82 | HashMap<String,String> interwiki = tokenizer.getInterwikis(); |
— | — | @@ -106,6 +109,9 @@ |
107 | 110 | // related |
108 | 111 | setRelatedAnalyzer(perFieldAnalyzer,fields.related(), |
109 | 112 | new RelatedAnalyzer(related,relatedPartition,filters.getNoStemmerFilterFactory(),fields.related(),exactCase)); |
| 113 | + // context |
| 114 | + setContextAnalyzer(perFieldAnalyzer,fields.context(), |
| 115 | + new ContextAnalyzer(title,links,related,relatedPartition,filters.getNoStemmerFilterFactory(),fields.context(),exactCase)); |
110 | 116 | return tokenizer; |
111 | 117 | } |
112 | 118 | |
— | — | @@ -126,24 +132,24 @@ |
127 | 133 | perFieldAnalyzer.addAnalyzer(prefix+i,analyzer); |
128 | 134 | } |
129 | 135 | } |
130 | | - |
131 | | - public static PerFieldAnalyzerWrapper getSearcherAnalyzer(IndexId iid, boolean exactCase){ |
132 | | - if(global == null) |
133 | | - global = GlobalConfiguration.getInstance(); |
134 | | - return getSearcherAnalyzer(global.getLanguage(iid.getDBname()),exactCase); |
135 | | - |
| 136 | + |
| 137 | + protected static void setContextAnalyzer(PerFieldAnalyzerWrapper perFieldAnalyzer, String prefix, ContextAnalyzer analyzer) { |
| 138 | + for(int i=1;i<=ContextAnalyzer.CONTEXT_GROUPS;i++){ |
| 139 | + perFieldAnalyzer.addAnalyzer(prefix+i,analyzer); |
| 140 | + } |
136 | 141 | } |
137 | 142 | |
138 | | - public static PerFieldAnalyzerWrapper getSearcherAnalyzer(String langCode){ |
139 | | - return getSearcherAnalyzer(langCode,false); |
| 143 | + |
| 144 | + public static PerFieldAnalyzerWrapper getSearcherAnalyzer(IndexId iid){ |
| 145 | + return getSearcherAnalyzer(iid,false); |
140 | 146 | } |
141 | 147 | |
142 | | - public static PerFieldAnalyzerWrapper getSearcherAnalyzer(String langCode, boolean exactCase){ |
143 | | - return getSearcherAnalyzer(new FilterFactory(langCode),new FieldNameFactory(exactCase)); |
| 148 | + public static PerFieldAnalyzerWrapper getSearcherAnalyzer(IndexId iid, boolean exactCase){ |
| 149 | + return getSearcherAnalyzer(new FilterFactory(iid),new FieldNameFactory(exactCase)); |
144 | 150 | } |
145 | 151 | |
146 | | - public static PerFieldAnalyzerWrapper getSpellCheckAnalyzer(String langCode, HashSet<String> stopWords){ |
147 | | - FilterFactory filters = new FilterFactory(langCode,FilterFactory.Type.SPELL_CHECK); |
| 152 | + public static PerFieldAnalyzerWrapper getSpellCheckAnalyzer(IndexId iid, HashSet<String> stopWords){ |
| 153 | + FilterFactory filters = new FilterFactory(iid,FilterFactory.Type.SPELL_CHECK); |
148 | 154 | filters.setStopWords(stopWords); |
149 | 155 | return getSearcherAnalyzer(filters,new FieldNameFactory()); |
150 | 156 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FieldNameFactory.java |
— | — | @@ -60,6 +60,13 @@ |
61 | 61 | return "related"; |
62 | 62 | } |
63 | 63 | |
| 64 | + public String context(){ |
| 65 | + if(exactCase) |
| 66 | + return "context_exact"; |
| 67 | + else |
| 68 | + return "context"; |
| 69 | + } |
| 70 | + |
64 | 71 | public String anchor(){ |
65 | 72 | if(exactCase) |
66 | 73 | return "anchor_exact"; |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java |
— | — | @@ -9,6 +9,7 @@ |
10 | 10 | import org.apache.lucene.analysis.Analyzer; |
11 | 11 | import org.apache.lucene.analysis.Token; |
12 | 12 | import org.apache.lucene.analysis.TokenStream; |
| 13 | +import org.wikimedia.lsearch.config.IndexId; |
13 | 14 | |
14 | 15 | /** |
15 | 16 | * Analyzer that builds a field with an array of keywords, |
— | — | @@ -28,6 +29,7 @@ |
29 | 30 | static Logger log = Logger.getLogger(KeywordsAnalyzer.class); |
30 | 31 | protected KeywordsTokenStream[] tokensBySize = null; |
31 | 32 | protected String prefix; |
| 33 | + protected IndexId iid; |
32 | 34 | |
33 | 35 | /** number of field to be generated, e.g. keyword1 for single-word keywords, |
34 | 36 | * keyword2 for two-word keywords, etc ... the last field has all the remaining keys |
— | — | @@ -50,6 +52,7 @@ |
51 | 53 | |
52 | 54 | protected void init(ArrayList<String> keywords, FilterFactory filters, String prefix, boolean exactCase) { |
53 | 55 | this.prefix = prefix; |
| 56 | + this.iid = filters.getIndexId(); |
54 | 57 | tokensBySize = new KeywordsTokenStream[KEYWORD_LEVELS]; |
55 | 58 | if(keywords == null){ |
56 | 59 | // init empty token streams |
— | — | @@ -63,7 +66,7 @@ |
64 | 67 | keywordsBySize.add(new ArrayList<String>()); |
65 | 68 | // arrange keywords into a list by token number |
66 | 69 | for(String k : keywords){ |
67 | | - ArrayList<Token> parsed = new FastWikiTokenizerEngine(k,filters.getLanguage(),exactCase).parse(); |
| 70 | + ArrayList<Token> parsed = new FastWikiTokenizerEngine(k,iid,exactCase).parse(); |
68 | 71 | if(parsed.size() == 0) |
69 | 72 | continue; |
70 | 73 | else if(parsed.size() < KEYWORD_LEVELS) |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FilterFactory.java |
— | — | @@ -11,6 +11,7 @@ |
12 | 12 | import org.apache.lucene.analysis.de.GermanStemFilter; |
13 | 13 | import org.apache.lucene.analysis.snowball.SnowballFilter; |
14 | 14 | import org.apache.lucene.analysis.th.ThaiWordFilter; |
| 15 | +import org.wikimedia.lsearch.config.IndexId; |
15 | 16 | |
16 | 17 | /** |
17 | 18 | * Make a language-dependent pair of filters. The custom filter is to be applied before the stemmer. |
— | — | @@ -20,6 +21,7 @@ |
21 | 22 | */ |
22 | 23 | public class FilterFactory { |
23 | 24 | protected String lang; |
| 25 | + protected IndexId iid; |
24 | 26 | protected String snowballName = null; |
25 | 27 | protected boolean useStemmer,useLangFilter; |
26 | 28 | protected Class stemmer = null; |
— | — | @@ -33,18 +35,20 @@ |
34 | 36 | public enum Type { FULL, NO_STEM, SPELL_CHECK }; |
35 | 37 | protected Type type = null; |
36 | 38 | |
37 | | - public FilterFactory(String lang){ |
38 | | - this(lang,Type.FULL); |
| 39 | + public FilterFactory(IndexId iid){ |
| 40 | + this(iid,Type.FULL); |
39 | 41 | } |
40 | 42 | |
41 | | - public FilterFactory(String lang, Type type){ |
42 | | - this.lang = lang; |
| 43 | + public FilterFactory(IndexId iid, Type type){ |
| 44 | + this.lang = iid.getLangCode(); |
| 45 | + this.iid = iid; |
43 | 46 | this.type = type; |
44 | 47 | init(); |
45 | | - noStemmerFilterFactory = new FilterFactory(lang,snowballName,false,useLangFilter,null,langFilter,additionalFilters); |
| 48 | + noStemmerFilterFactory = new FilterFactory(iid,lang,snowballName,false,useLangFilter,null,langFilter,additionalFilters); |
46 | 49 | } |
47 | 50 | |
48 | | - public FilterFactory(String lang, String snowballName, boolean useStemmer, boolean useLangFilter, Class stemmer, Class langFilter, ArrayList<Class> additionalFilters) { |
| 51 | + public FilterFactory(IndexId iid, String lang, String snowballName, boolean useStemmer, boolean useLangFilter, Class stemmer, Class langFilter, ArrayList<Class> additionalFilters) { |
| 52 | + this.iid = iid; |
49 | 53 | this.lang = lang; |
50 | 54 | this.snowballName = snowballName; |
51 | 55 | this.useStemmer = useStemmer; |
— | — | @@ -193,6 +197,12 @@ |
194 | 198 | public void setStopWords(Set<String> stopWords){ |
195 | 199 | this.stopWords = stopWords; |
196 | 200 | } |
| 201 | + |
| 202 | + public IndexId getIndexId() { |
| 203 | + return iid; |
| 204 | + } |
197 | 205 | |
198 | 206 | |
| 207 | + |
| 208 | + |
199 | 209 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/QueryLanguageAnalyzer.java |
— | — | @@ -25,7 +25,7 @@ |
26 | 26 | */ |
27 | 27 | @Override |
28 | 28 | public TokenStream tokenStream(String fieldName, String text) { |
29 | | - wikitokenizer = new WikiTokenizer(text,filters.getLanguage(),exactCase); |
| 29 | + wikitokenizer = new WikiTokenizer(text,filters.getIndexId(),exactCase); |
30 | 30 | return super.tokenStream(fieldName,(Reader)null); |
31 | 31 | } |
32 | 32 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/RelatedAnalyzer.java |
— | — | @@ -21,6 +21,7 @@ |
22 | 22 | |
23 | 23 | public RelatedAnalyzer(ArrayList<RelatedTitle> related, int[] p, FilterFactory filters, String prefix, boolean exactCase) { |
24 | 24 | this.prefix = prefix; |
| 25 | + this.iid = filters.getIndexId(); |
25 | 26 | tokensBySize = new KeywordsTokenStream[RELATED_GROUPS]; |
26 | 27 | if(related == null || p == null){ |
27 | 28 | // init empty token streams |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/ContextAnalyzer.java |
— | — | @@ -0,0 +1,60 @@ |
| 2 | +package org.wikimedia.lsearch.analyzers; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | +import java.util.ArrayList; |
| 6 | +import java.util.Collection; |
| 7 | + |
| 8 | +import org.wikimedia.lsearch.analyzers.KeywordsAnalyzer.KeywordsTokenStream; |
| 9 | +import org.wikimedia.lsearch.beans.Title; |
| 10 | +import org.wikimedia.lsearch.ranks.Links; |
| 11 | +import org.wikimedia.lsearch.related.RelatedTitle; |
| 12 | + |
| 13 | +/** |
| 14 | + * Link contexts, tokenized, with token gaps between contexts |
| 15 | + * |
| 16 | + * @author rainman |
| 17 | + * |
| 18 | + */ |
| 19 | +public class ContextAnalyzer extends KeywordsAnalyzer { |
| 20 | + static public int CONTEXT_GROUPS = 2; |
| 21 | + |
| 22 | + static public int TOKEN_GAP = 100; |
| 23 | + |
| 24 | + public ContextAnalyzer(Title title, Links links, ArrayList<RelatedTitle> related, int[] p, FilterFactory filters, String prefix, boolean exactCase) { |
| 25 | + this.prefix = prefix; |
| 26 | + this.iid = filters.getIndexId(); |
| 27 | + tokensBySize = new KeywordsTokenStream[CONTEXT_GROUPS]; |
| 28 | + if(related == null || p == null || title == null || links == null){ |
| 29 | + // init empty token streams |
| 30 | + for(int i=0; i< CONTEXT_GROUPS; i++){ |
| 31 | + tokensBySize[i] = new KeywordsTokenStream(null,filters,exactCase,TOKEN_GAP); |
| 32 | + } |
| 33 | + return; |
| 34 | + } |
| 35 | + String key = title.getKey(); |
| 36 | + // split up related titles into context-group partitions |
| 37 | + ArrayList<ArrayList<String>> partitions = new ArrayList<ArrayList<String>>(); |
| 38 | + for(int i=0;i<CONTEXT_GROUPS;i++){ |
| 39 | + ArrayList<String> part = new ArrayList<String>(); |
| 40 | + for(int j=p[i];j<p[i+1];j++){ |
| 41 | + Title t = related.get(j).getRelated(); |
| 42 | + Collection<String> contexts; |
| 43 | + try { |
| 44 | + contexts = links.getContext(t.getKey(),key); |
| 45 | + //System.out.println("CONTEXT "+t.getKey()+" -> "+key+" : "+contexts); |
| 46 | + if(contexts != null) |
| 47 | + part.addAll(contexts); |
| 48 | + } catch (IOException e) { |
| 49 | + log.warn("Cannot fetch context for "+key+" from "+t.getKey()+" : "+e.getMessage()); |
| 50 | + e.printStackTrace(); |
| 51 | + } |
| 52 | + |
| 53 | + } |
| 54 | + partitions.add(part); |
| 55 | + } |
| 56 | + for(int i=0; i< CONTEXT_GROUPS; i++){ |
| 57 | + tokensBySize[i] = new KeywordsTokenStream(partitions.get(i),filters,exactCase,TOKEN_GAP); |
| 58 | + } |
| 59 | + } |
| 60 | + |
| 61 | +} |
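The TOKEN_GAP of 100 keeps contexts gathered from different source articles far apart in token-position space, so a phrase query (even a sloppy one) cannot match across two unrelated contexts. A conceptual sketch of the mechanism, not the actual KeywordsTokenStream code:

    import org.apache.lucene.analysis.Token;

    // the first token of each new context jumps TOKEN_GAP positions ahead
    Token first = new Token("linked", 0, 6);
    first.setPositionIncrement(ContextAnalyzer.TOKEN_GAP); // 100-position gap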
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FieldBuilder.java |
— | — | @@ -1,5 +1,7 @@ |
2 | 2 | package org.wikimedia.lsearch.analyzers; |
3 | 3 | |
| 4 | +import org.wikimedia.lsearch.config.IndexId; |
| 5 | + |
4 | 6 | /** |
5 | 7 | * Aggregate class for FilterFactory and FieldNameFactory. This class |
6 | 8 | * contains methods used to build various fields of the index, |
— | — | @@ -47,15 +49,15 @@ |
48 | 50 | public static enum Options { NONE, SPELL_CHECK }; |
49 | 51 | |
50 | 52 | /** Construct case-insensitive field builder with stemming */ |
51 | | - public FieldBuilder(String lang){ |
52 | | - this(lang,Case.IGNORE_CASE,Stemmer.USE_STEMMER,Options.NONE); |
| 53 | + public FieldBuilder(IndexId iid){ |
| 54 | + this(iid,Case.IGNORE_CASE,Stemmer.USE_STEMMER,Options.NONE); |
53 | 55 | } |
54 | 56 | |
55 | | - public FieldBuilder(String lang, Case useCase){ |
56 | | - this(lang,useCase,Stemmer.USE_STEMMER,Options.NONE); |
| 57 | + public FieldBuilder(IndexId iid, Case useCase){ |
| 58 | + this(iid,useCase,Stemmer.USE_STEMMER,Options.NONE); |
57 | 59 | } |
58 | 60 | |
59 | | - public FieldBuilder(String lang, Case useCase, Stemmer useStemmer, Options options){ |
| 61 | + public FieldBuilder(IndexId iid, Case useCase, Stemmer useStemmer, Options options){ |
60 | 62 | FilterFactory.Type type = FilterFactory.Type.FULL; |
61 | 63 | if(options == Options.SPELL_CHECK) |
62 | 64 | type = FilterFactory.Type.SPELL_CHECK; |
— | — | @@ -63,7 +65,7 @@ |
64 | 66 | if(useCase == Case.EXACT_CASE){ |
65 | 67 | builders = new BuilderSet[2]; |
66 | 68 | builders[1] = new BuilderSet( |
67 | | - new FilterFactory(lang,type).getNoStemmerFilterFactory(), |
| 69 | + new FilterFactory(iid,type).getNoStemmerFilterFactory(), |
68 | 70 | new FieldNameFactory(FieldNameFactory.EXACT_CASE)); |
69 | 71 | } else |
70 | 72 | builders = new BuilderSet[1]; |
— | — | @@ -71,11 +73,11 @@ |
72 | 74 | // default factory, lowercase all data |
73 | 75 | if(useStemmer == Stemmer.USE_STEMMER){ |
74 | 76 | builders[0] = new BuilderSet( |
75 | | - new FilterFactory(lang,type), |
| 77 | + new FilterFactory(iid,type), |
76 | 78 | new FieldNameFactory()); |
77 | 79 | } else{ |
78 | 80 | builders[0] = new BuilderSet( |
79 | | - new FilterFactory(lang,type).getNoStemmerFilterFactory(), |
| 81 | + new FilterFactory(iid,type).getNoStemmerFilterFactory(), |
80 | 82 | new FieldNameFactory()); |
81 | 83 | } |
82 | 84 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiTokenizer.java |
— | — | @@ -10,6 +10,7 @@ |
11 | 11 | import org.apache.log4j.Logger; |
12 | 12 | import org.apache.lucene.analysis.Token; |
13 | 13 | import org.apache.lucene.analysis.Tokenizer; |
| 14 | +import org.wikimedia.lsearch.config.IndexId; |
14 | 15 | |
15 | 16 | /** Uses FastWikiTokenizerEngine to tokenize text */ |
16 | 17 | public class WikiTokenizer extends Tokenizer { |
— | — | @@ -36,8 +37,8 @@ |
37 | 38 | * @param str |
38 | 39 | */ |
39 | 40 | |
40 | | - public WikiTokenizer(String str, String lang, boolean exactCase){ |
41 | | - parser = new FastWikiTokenizerEngine(str,lang,exactCase); |
| 41 | + public WikiTokenizer(String str, IndexId iid, boolean exactCase){ |
| 42 | + parser = new FastWikiTokenizerEngine(str,iid,exactCase); |
42 | 43 | this.input = null; |
43 | 44 | } |
44 | 45 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/SplitAnalyzer.java |
— | — | @@ -10,7 +10,7 @@ |
11 | 11 | import org.apache.lucene.analysis.Tokenizer; |
12 | 12 | import org.wikimedia.lsearch.ranks.StringList; |
13 | 13 | |
14 | | -/** Split the text by some specific char */ |
| 14 | +/** Analyzes a serialized StringList into its components */ |
15 | 15 | public class SplitAnalyzer extends Analyzer { |
16 | 16 | class SplitTokenStream extends Tokenizer { |
17 | 17 | Iterator<String> it = null; |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java |
— | — | @@ -9,6 +9,7 @@ |
10 | 10 | |
11 | 11 | import org.apache.commons.lang.WordUtils; |
12 | 12 | import org.apache.lucene.analysis.Token; |
| 13 | +import org.wikimedia.lsearch.config.IndexId; |
13 | 14 | import org.wikimedia.lsearch.util.Localization; |
14 | 15 | import org.wikimedia.lsearch.util.UnicodeDecomposer; |
15 | 16 | |
— | — | @@ -67,6 +68,7 @@ |
68 | 69 | |
69 | 70 | /** language code */ |
70 | 71 | private String language; |
| 72 | + private IndexId iid; |
71 | 73 | /** language code -> set (image namespace names) */ |
72 | 74 | private static Hashtable<String,HashSet<String>> imageLocalized = new Hashtable<String,HashSet<String>>(); |
73 | 75 | /** language code -> set (category namespace names) */ |
— | — | @@ -111,10 +113,11 @@ |
112 | 114 | } |
113 | 115 | } |
114 | 116 | |
115 | | - public FastWikiTokenizerEngine(String text, String lang, boolean exactCase){ |
| 117 | + public FastWikiTokenizerEngine(String text, IndexId iid, boolean exactCase){ |
116 | 118 | this.text = text.toCharArray(); |
117 | 119 | this.textString = text; |
118 | | - this.language = lang; |
| 120 | + this.language = iid.getLangCode(); |
| 121 | + this.iid = iid; |
119 | 122 | this.exactCase = exactCase; |
120 | 123 | textLength = text.length(); |
121 | 124 | init(); |
— | — | @@ -744,7 +747,7 @@ |
745 | 748 | else if(language!=null && language.length()!=0){ |
746 | 749 | HashSet<String> loc = imageLocalized.get(language); |
747 | 750 | if(loc == null){ |
748 | | - loc = Localization.getLocalizedImage(language); |
| 751 | + loc = Localization.getLocalizedImage(language,iid.getDBname()); |
749 | 752 | imageLocalized.put(language,loc); |
750 | 753 | } |
751 | 754 | if(loc.contains(prefix)) |
— | — | @@ -761,7 +764,7 @@ |
762 | 765 | else if(language!=null && language.length()!=0){ |
763 | 766 | HashSet<String> loc = categoryLocalized.get(language); |
764 | 767 | if(loc == null){ |
765 | | - loc = Localization.getLocalizedCategory(language); |
| 768 | + loc = Localization.getLocalizedCategory(language,iid.getDBname()); |
766 | 769 | categoryLocalized.put(language,loc); |
767 | 770 | } |
768 | 771 | if(loc.contains(prefix)) |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/Localization.java |
— | — | @@ -8,6 +8,7 @@ |
9 | 9 | import java.util.HashMap; |
10 | 10 | import java.util.Hashtable; |
11 | 11 | import java.util.HashSet; |
| 12 | +import java.util.Map; |
12 | 13 | import java.util.Map.Entry; |
13 | 14 | |
14 | 15 | import org.apache.log4j.Logger; |
— | — | @@ -28,9 +29,16 @@ |
29 | 30 | protected static Object lock = new Object(); |
30 | 31 | /** Languages for which loading of localization failed */ |
31 | 32 | protected static HashSet<String> badLocalizations = new HashSet<String>(); |
| 34 | + /** Languages for which localization was successfully loaded */ |
| 34 | + protected static HashSet<String> loadedLocalizations = new HashSet<String>(); |
32 | 35 | protected static HashSet<String> interwiki = null; |
33 | 36 | /** lowercased canonical names of namespaces */ |
34 | | - protected static Hashtable<String,Integer> canonicalNamespaces = new Hashtable<String,Integer>(); |
| 37 | + protected static Hashtable<String,Integer> canonicalNamespaces = new Hashtable<String,Integer>(); |
| 38 | + /** dbname -> meta namespace name */ |
| 39 | + protected static Hashtable<String,String> metaNamespaces = new Hashtable<String,String>(); |
| 40 | + /** custom maps (for oai headers, etc..) dbname -> nsname -> nsindex */ |
| 41 | + protected static Hashtable<String,Hashtable<String,Integer>> customNamespaces = new Hashtable<String,Hashtable<String,Integer>>(); |
| 42 | + |
35 | 43 | static{ |
36 | 44 | canonicalNamespaces.put("media",-2); |
37 | 45 | canonicalNamespaces.put("special",-1); |
— | — | @@ -51,48 +59,72 @@ |
52 | 60 | canonicalNamespaces.put("category_talk",15); |
53 | 61 | } |
54 | 62 | |
| 63 | + /** Set meta namespaces for specific db names */ |
| 64 | + public static void setMetaNamespace(Map<String,String> dbmeta){ |
| 65 | + synchronized(lock){ |
| 66 | + metaNamespaces.putAll(dbmeta); |
| 67 | + } |
| 68 | + } |
| 69 | + |
55 | 70 | /** Add custom mappings not found in localization files, from other sources, e.g. project name, etc. */ |
56 | | - public static void addCustomMapping(String namespace, int index, String langCode){ |
| 71 | + public static void addCustomMapping(String namespace, int index, String dbname){ |
57 | 72 | synchronized(lock){ |
58 | | - Hashtable<String,Integer> map = namespaces.get(langCode); |
| 73 | + Hashtable<String,Integer> map = customNamespaces.get(dbname); |
59 | 74 | if(map == null){ |
60 | 75 | map = new Hashtable<String,Integer>(); |
61 | | - namespaces.put(langCode,map); |
| 76 | + customNamespaces.put(dbname,map); |
62 | 77 | } |
63 | 78 | map.put(namespace.toLowerCase(),index); |
64 | 79 | } |
65 | 80 | } |
66 | | - |
67 | | - public static HashSet<String> getLocalizedImage(String langCode){ |
68 | | - return getLocalizedNamespace(langCode,6); |
| 81 | + /** Get a new hashset of localized image namespace names */ |
| 82 | + public static HashSet<String> getLocalizedImage(String langCode, String dbname){ |
| 83 | + return getLocalizedNamespace(langCode,6,dbname); |
69 | 84 | } |
70 | | - |
71 | | - public static HashSet<String> getLocalizedCategory(String langCode){ |
72 | | - return getLocalizedNamespace(langCode,14); |
| 85 | + /** Get a new hashset of localized category namespace names */ |
| 86 | + public static HashSet<String> getLocalizedCategory(String langCode, String dbname){ |
| 87 | + return getLocalizedNamespace(langCode,14,dbname); |
73 | 88 | } |
74 | 89 | |
75 | | - public static HashSet<String> getLocalizedNamespace(String langCode, int nsId){ |
| 90 | + public static HashSet<String> getLocalizedNamespace(String langCode, int nsId, String dbname){ |
76 | 91 | synchronized (lock){ |
| 92 | + HashSet<String> res = new HashSet<String>(); |
77 | 93 | langCode = langCode.toLowerCase(); |
78 | | - if(namespaces.get(langCode)==null){ |
79 | | - if(badLocalizations.contains(langCode) || !readLocalization(langCode)) |
80 | | - return new HashSet<String>(); |
| 94 | + if(namespaces.get(langCode)==null) |
| 95 | + readLocalization(langCode); |
| 96 | + |
| 97 | + // get namespaces from message files |
| 98 | + res.addAll(collect(namespaces.get(langCode),nsId)); |
| 99 | + // get db-specific names, like meta namespaces or ones obtained via oai or other ways |
| 100 | + if(dbname != null){ |
| 101 | + res.addAll(collect(customNamespaces.get(dbname),nsId)); |
| 102 | + if(nsId == 4 && metaNamespaces.containsKey(dbname)) |
| 103 | + res.add(metaNamespaces.get(dbname)); |
81 | 104 | } |
82 | | - return collect(namespaces.get(langCode),nsId); |
| 105 | + return res; |
83 | 106 | } |
84 | 107 | } |
85 | 108 | |
86 | 109 | /** Get mapping namespace_name (lowercase) -> namespace_index */ |
87 | | - public static HashMap<String,Integer> getLocalizedNamespaces(String langCode){ |
| 110 | + public static HashMap<String,Integer> getLocalizedNamespaces(String langCode, String dbname){ |
88 | 111 | synchronized (lock){ |
89 | 112 | HashMap<String,Integer> ret = new HashMap<String,Integer>(); |
90 | 113 | ret.putAll(canonicalNamespaces); |
91 | 114 | langCode = langCode.toLowerCase(); |
92 | | - if(namespaces.get(langCode)==null){ |
93 | | - if(badLocalizations.contains(langCode) || !readLocalization(langCode)) |
94 | | - return ret; |
| 115 | + if(namespaces.get(langCode)==null) |
| 116 | + readLocalization(langCode); |
| 117 | + // localization from messages files |
| 118 | + if(namespaces.containsKey(langCode)) |
| 119 | + ret.putAll(namespaces.get(langCode)); |
| 120 | + // db-specific |
| 121 | + if(dbname != null){ |
| 122 | + // meta namespaces |
| 123 | + if(metaNamespaces.containsKey(dbname)) |
| 124 | + ret.put(metaNamespaces.get(dbname),4); |
| 125 | + // custom |
| 126 | + if(customNamespaces.containsKey(dbname)) |
| 127 | + ret.putAll(customNamespaces.get(dbname)); |
95 | 128 | } |
96 | | - ret.putAll(namespaces.get(langCode)); |
97 | 129 | return ret; |
98 | 130 | } |
99 | 131 | } |
— | — | @@ -107,6 +139,8 @@ |
108 | 140 | /** Collect all the names with some certain namespace id */ |
109 | 141 | protected static HashSet<String> collect(Hashtable<String,Integer> ns, int nsid) { |
110 | 142 | HashSet<String> ret = new HashSet<String>(); |
| 143 | + if(ns == null) |
| 144 | + return ret; |
111 | 145 | for(Entry<String,Integer> e : ns.entrySet()){ |
112 | 146 | if(e.getValue().intValue() == nsid) |
113 | 147 | ret.add(e.getKey()); |
— | — | @@ -123,6 +157,10 @@ |
124 | 158 | /** Level is recursion level (to detect infinite recursion if language |
125 | 159 | * defines itself as a fallback) */ |
126 | 160 | protected static boolean readLocalization(String langCode, int level){ |
| 161 | + if(badLocalizations.contains(langCode)) |
| 162 | + return false; // failed previously |
| 163 | + if(loadedLocalizations.contains(langCode)) |
| 164 | + return true; // already loaded |
127 | 165 | Configuration config = Configuration.open(); |
128 | 166 | if(langCode == null || langCode.equals("")) |
129 | 167 | return false; |
— | — | @@ -158,6 +196,7 @@ |
159 | 197 | if(ns!=null && ns.size()!=0){ |
160 | 198 | namespaces.put(langCode.toLowerCase(),ns); |
161 | 199 | log.debug("Succesfully loaded localization for "+langCode.toLowerCase()); |
| 200 | + loadedLocalizations.add(langCode); |
162 | 201 | return true; |
163 | 202 | } else{ // maybe a fallback language is defined instead |
164 | 203 | String fallback = parser.getFallBack(text); |
— | — | @@ -165,6 +204,7 @@ |
166 | 205 | fallback = fallback.replace('-','_'); |
167 | 206 | boolean succ = readLocalization(fallback,level+1); |
168 | 207 | if(succ){ |
| 208 | + loadedLocalizations.add(fallback); |
169 | 209 | namespaces.put(langCode.toLowerCase(),namespaces.get(fallback.toLowerCase())); |
170 | 210 | redirects.put(langCode.toLowerCase(),redirects.get(fallback.toLowerCase())); |
171 | 211 | } |
— | — | @@ -216,9 +256,13 @@ |
217 | 257 | int end = line.indexOf("]]"); |
218 | 258 | if(begin != -1 && end != -1 && end > begin){ |
219 | 259 | String redirectText = text.substring(begin+2,end); |
| 260 | + int pipe = redirectText.indexOf('|'); |
| 261 | + if(pipe != -1) |
| 262 | + redirectText = redirectText.substring(0,pipe); |
220 | 263 | int fragment = redirectText.lastIndexOf('#'); |
221 | 264 | if(fragment != -1) |
222 | 265 | redirectText = redirectText.substring(0,fragment); |
| 266 | + redirectText = redirectText.replace('_',' '); |
223 | 267 | return redirectText; |
224 | 268 | } |
225 | 269 | } |
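The redirect-target fix strips pipe text, drops URL fragments, and normalizes underscores to spaces. By example, the same steps applied to a hypothetical piped, underscored target:

    String redirectText = "Main_Page|start here"; // hypothetical target text
    int pipe = redirectText.indexOf('|');
    if (pipe != -1) redirectText = redirectText.substring(0, pipe);
    int fragment = redirectText.lastIndexOf('#');
    if (fragment != -1) redirectText = redirectText.substring(0, fragment);
    redirectText = redirectText.replace('_', ' ');
    System.out.println(redirectText); // prints: Main Page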
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/FSUtils.java |
— | — | @@ -0,0 +1,139 @@ |
| 2 | +package org.wikimedia.lsearch.util; |
| 3 | + |
| 4 | +import java.io.File; |
| 5 | +import java.io.IOException; |
| 6 | + |
| 7 | +/** |
| 8 | + * Various abstractions of file system operations: deleting dirs, |
| 9 | + * making soft/hard links ... |
| 10 | + * |
| 11 | + * Based on FileUtil.java from the Lucene Hadoop project (Apache License) |
| 12 | + * @author rainman |
| 13 | + * |
| 14 | + */ |
| 15 | +public class FSUtils { |
| 16 | + public static final String PATH_SEP = System.getProperty("file.separator"); |
| 17 | + |
| 18 | + enum OSType { OS_TYPE_UNIX, OS_TYPE_WINXP }; |
| 19 | + |
| 20 | + protected static String[] hardLinkCommand; |
| 21 | + |
| 22 | + static { |
| 23 | + switch(getOSType()) { |
| 24 | + case OS_TYPE_WINXP: |
| 25 | + hardLinkCommand = new String[] {"fsutil","hardlink","create", null, null}; |
| 26 | + break; |
| 27 | + case OS_TYPE_UNIX: |
| 28 | + default: |
| 29 | + hardLinkCommand = new String[] {"ln", null, null}; |
| 30 | + } |
| 31 | + } |
| 32 | + |
| 33 | + static OSType getOSType() { |
| 34 | + String osName = System.getProperty("os.name"); |
| 35 | + if (osName.indexOf("Windows") >= 0 && |
| 36 | + (osName.indexOf("XP") >= 0 || osName.indexOf("2003") >= 0)) |
| 37 | + return OSType.OS_TYPE_WINXP; |
| 38 | + else |
| 39 | + return OSType.OS_TYPE_UNIX; |
| 40 | + } |
| 41 | + |
| 42 | + /** |
| 43 | + * Create a hardlink in the filesystem. |
| 44 | + * |
| 45 | + * @param target |
| 46 | + * @param linkName |
| 47 | + * @throws IOException |
| 48 | + */ |
| 49 | + public static void createHardLink(File target, File linkName) throws IOException { |
| 50 | + int len = hardLinkCommand.length; |
| 51 | + hardLinkCommand[len-2] = target.getCanonicalPath(); |
| 52 | + hardLinkCommand[len-1] = linkName.getCanonicalPath(); |
| 53 | + Command.exec(hardLinkCommand); |
| 54 | + } |
| 55 | + |
| 56 | + /** |
| 57 | + * Create hard links recursively if the target is a directory |
| 58 | + * |
| 59 | + * @param target |
| 60 | + * @param linkname |
| 61 | + * @throws IOException |
| 62 | + */ |
| 63 | + public static void createHardLinkRecursive(String target, String linkname) throws IOException { |
| 64 | + File file = new File(target); |
| 65 | + if(!file.exists()) |
| 66 | + throw new IOException("Trying to hardlink nonexisting file "+target); |
| 67 | + if(file.isDirectory()){ |
| 68 | + File[] files = file.listFiles(); |
| 69 | + for(File f: files) |
| 70 | + createHardLinkRecursive(format(new String[]{target,f.getName()}),format(new String[] {linkname,f.getName()})); |
| 71 | + } else |
| 72 | + createHardLink(new File(target),new File(linkname)); |
| 73 | + } |
| 74 | + |
| 75 | + |
| 76 | + /** |
| 77 | + * Create a soft link between a source and a destination |
| 78 | + * (local filesystems only) |
| 79 | + * @param target the target for symlink |
| 80 | + * @param linkname the symlink |
| 81 | + */ |
| 82 | + public static void createSymLink(String target, String linkname) throws IOException{ |
| 83 | + String cmd = "ln -s " + target + " " + linkname; |
| 84 | + Command.exec(cmd); |
| 85 | + } |
| 86 | + |
| 87 | + /** |
| 88 | + * Append path parts via the system's path separator. |
| 89 | + * E.g. {"/usr/local", "search"} -> "/usr/local/search/" (note the trailing separator) |
| 90 | + * @param parts |
| 91 | + */ |
| 92 | + public static String format(String[] parts){ |
| 93 | + StringBuilder sb = new StringBuilder(); |
| 94 | + boolean first = true; |
| 95 | + for(String p : parts){ |
| 96 | + if(!first && p.startsWith(PATH_SEP)) |
| 97 | + p = p.substring(PATH_SEP.length()); |
| 98 | + sb.append(p); |
| 99 | + if(!p.endsWith(PATH_SEP)) |
| 100 | + sb.append(PATH_SEP); |
| 101 | + if(first) |
| 102 | + first = false; |
| 103 | + } |
| 104 | + return sb.toString(); |
| 105 | + } |
| 106 | + |
| 107 | + /** |
| 108 | + * Construct a file from parts of path |
| 109 | + * @param parts |
| 110 | + */ |
| 111 | + public static File formatFile(String[] parts){ |
| 112 | + return new File(format(parts)); |
| 113 | + } |
| 114 | + |
| 115 | + /** |
| 116 | + * Delete a file recursively |
| 117 | + * |
| 118 | + * @param file |
| 119 | + */ |
| 120 | + public static void deleteRecursive(File file){ |
| 121 | + if(!file.exists()) |
| 122 | + return; |
| 123 | + else if(file.isDirectory()){ |
| 124 | + File[] files = file.listFiles(); |
| 125 | + for(File f: files) |
| 126 | + deleteRecursive(f); |
| 127 | + file.delete(); |
| 128 | + } else{ |
| 129 | + file.delete(); |
| 130 | + } |
| 131 | + } |
| 132 | + |
| 133 | + /** Delete single file */ |
| 134 | + public static void delete(String path) { |
| 135 | + File f = new File(path); |
| 136 | + if(f.exists()) // if doesn't exist don't complain |
| 137 | + f.delete(); |
| 138 | + } |
| 139 | + |
| 140 | +} |
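A usage sketch for the path helpers (Unix separator assumed); note that format, as written, leaves a trailing separator:

    String path = FSUtils.format(new String[]{"/usr/local", "search", "indexes"});
    // -> "/usr/local/search/indexes/"
    java.io.File f = FSUtils.formatFile(new String[]{"/usr/local", "search"});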
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/Command.java |
— | — | @@ -24,10 +24,17 @@ |
25 | 25 | } |
26 | 26 | |
27 | 27 | public static void exec(String command) throws IOException { |
| 28 | + exec(new String[] {command}); |
| 29 | + } |
| 30 | + |
| 31 | + public static void exec(String[] command) throws IOException { |
28 | 32 | Process p = null; |
29 | 33 | log.debug("Executing shell command "+command); |
30 | 34 | try { |
31 | | - p = Runtime.getRuntime().exec(command); |
| 35 | + if(command.length == 1) |
| 36 | + p = Runtime.getRuntime().exec(command[0]); |
| 37 | + else |
| 38 | + p = Runtime.getRuntime().exec(command); |
32 | 39 | p.waitFor(); |
33 | 40 | if(p.exitValue()!=0){ |
34 | 41 | log.warn("Got exit value "+p.exitValue()+" while executing "+command); |
— | — | @@ -43,6 +50,8 @@ |
44 | 51 | throw new IOException("Interrupted"); |
45 | 52 | } finally { |
46 | 53 | closeStreams(p); |
| 54 | + if(p != null) |
| 55 | + p.destroy(); |
47 | 56 | } |
48 | 57 | } |
49 | 58 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/PHPParser.java |
— | — | @@ -162,6 +162,24 @@ |
163 | 163 | return servers; |
164 | 164 | } |
165 | 165 | |
| 166 | + /** Get wgMetaNamespace (dbname -> meta namespace name) from InitialiseSettings */ |
| 167 | + public Hashtable<String,String> getMetaNamespace(String text){ |
| 168 | + text = text.replaceAll("(#.*)",""); // strip comments |
| 169 | + Hashtable<String,String> meta = new Hashtable<String,String>(); |
| 170 | + |
| 171 | + int flags = Pattern.CASE_INSENSITIVE | Pattern.DOTALL; |
| 172 | + Pattern wgmeta = Pattern.compile("[\"']wgMetaNamespace[\"']\\s*=>\\s*array\\s*\\((.*?)\\)",flags); |
| 173 | + Pattern entry = Pattern.compile("[\"'](.*?)[\"']\\s*=>\\s*[\"'](.*?)[\"']",flags); |
| 174 | + Matcher matcher = wgmeta.matcher(text); |
| 175 | + while(matcher.find()){ |
| 176 | + Matcher me = entry.matcher(matcher.group(1)); |
| 177 | + while(me.find()){ |
| 178 | + meta.put(me.group(1),me.group(2)); |
| 179 | + } |
| 180 | + } |
| 181 | + return meta; |
| 182 | + } |
| 183 | + |
166 | 184 | /** Get wgNamespacesToBeSearchedDefault from InitialiseSettings */ |
167 | 185 | public Hashtable<String,NamespaceFilter> getDefaultSearch(String text){ |
168 | 186 | text = text.replaceAll("(#.*)",""); // strip comments |
— | — | @@ -276,6 +294,7 @@ |
277 | 295 | System.out.println(p.getLanguages(initset)); |
278 | 296 | System.out.println(p.getServer(initset)); |
279 | 297 | System.out.println(p.getDefaultSearch(initset)); |
| 298 | + System.out.println(p.getMetaNamespace(initset)); |
280 | 299 | |
281 | 300 | |
282 | 301 | } |
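By example, the regex pair in getMetaNamespace pulls per-db meta namespace names out of a wgMetaNamespace array. A sketch with a hypothetical InitialiseSettings fragment (assuming a no-arg PHPParser constructor, as the existing main() suggests):

    PHPParser p = new PHPParser();
    String initset = "'wgMetaNamespace' => array( 'srwiki' => 'Vikipedija', 'viwiki' => 'Wikipedia' )"; // hypothetical
    System.out.println(p.getMetaNamespace(initset));
    // -> {viwiki=Wikipedia, srwiki=Vikipedija}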
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/TitleReader.java |
— | — | @@ -1,67 +0,0 @@ |
2 | | -package org.wikimedia.lsearch.ranks; |
3 | | - |
4 | | -import java.io.IOException; |
5 | | -import java.util.ArrayList; |
6 | | -import java.util.HashMap; |
7 | | -import java.util.HashSet; |
8 | | -import java.util.Iterator; |
9 | | -import java.util.Map.Entry; |
10 | | - |
11 | | -import org.mediawiki.importer.DumpWriter; |
12 | | -import org.mediawiki.importer.Page; |
13 | | -import org.mediawiki.importer.Revision; |
14 | | -import org.mediawiki.importer.Siteinfo; |
15 | | -import org.wikimedia.lsearch.beans.ArticleLinks; |
16 | | -import org.wikimedia.lsearch.beans.Title; |
17 | | -import org.wikimedia.lsearch.config.IndexId; |
18 | | -import org.wikimedia.lsearch.util.Localization; |
19 | | - |
20 | | -/** |
21 | | - * Read a HashSet of titles from dump |
22 | | - * |
23 | | - * @author rainman |
24 | | - * |
25 | | - */ |
26 | | -public class TitleReader implements DumpWriter{ |
27 | | - Page page; |
28 | | - Revision revision; |
29 | | - Links links; |
30 | | - protected String langCode; |
31 | | - |
32 | | - public TitleReader(String langCode, IndexId iid) throws IOException{ |
33 | | - this.langCode = langCode; |
34 | | - this.links = Links.createNew(iid); |
35 | | - } |
36 | | - |
37 | | - public void writeRevision(Revision revision) throws IOException { |
38 | | - this.revision = revision; |
39 | | - } |
40 | | - public void writeStartPage(Page page) throws IOException { |
41 | | - this.page = page; |
42 | | - } |
43 | | - public void writeEndPage() throws IOException { |
44 | | - String key = page.Title.Namespace+":"+page.Title.Text; |
45 | | - links.addTitle(new Title(key)); |
46 | | - } |
47 | | - public Links getLinks() { |
48 | | - return links; |
49 | | - } |
50 | | - public void close() throws IOException { |
51 | | - // nop |
52 | | - } |
53 | | - public void writeEndWiki() throws IOException { |
54 | | - // nop |
55 | | - } |
56 | | - public void writeSiteinfo(Siteinfo info) throws IOException { |
57 | | - // write siteinfo to localization |
58 | | - Iterator it = info.Namespaces.orderedEntries(); |
59 | | - while(it.hasNext()){ |
60 | | - Entry<Integer,String> pair = (Entry<Integer,String>)it.next(); |
61 | | - Localization.addCustomMapping(pair.getValue(),pair.getKey(),langCode); |
62 | | - links.addToNamespaceMap(pair.getValue(),pair.getKey()); |
63 | | - } |
64 | | - } |
65 | | - public void writeStartWiki() throws IOException { |
66 | | - // nop |
67 | | - } |
68 | | -} |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/LinkReader.java |
— | — | @@ -3,6 +3,8 @@ |
4 | 4 | import java.io.IOException; |
5 | 5 | import java.util.HashMap; |
6 | 6 | import java.util.HashSet; |
| 7 | +import java.util.Iterator; |
| 8 | +import java.util.Map.Entry; |
7 | 9 | import java.util.regex.Matcher; |
8 | 10 | import java.util.regex.Pattern; |
9 | 11 | |
— | — | @@ -35,12 +37,14 @@ |
36 | 38 | Links links; |
37 | 39 | HashSet<String> interwiki; |
38 | 40 | String langCode; |
| 41 | + IndexId iid; |
39 | 42 | |
40 | | - public LinkReader(Links links, String langCode){ |
| 43 | + public LinkReader(Links links, IndexId iid, String langCode){ |
41 | 44 | this.links = links; |
42 | 45 | if(langCode == null || langCode.equals("")) |
43 | 46 | langCode = "en"; |
44 | 47 | this.langCode = langCode; |
| 48 | + this.iid = iid; |
45 | 49 | interwiki = Localization.getInterwiki(); |
46 | 50 | } |
47 | 51 | public void writeRevision(Revision revision) throws IOException { |
— | — | @@ -50,10 +54,23 @@ |
51 | 55 | this.page = page; |
52 | 56 | } |
53 | 57 | public void writeEndPage() throws IOException { |
54 | | - links.addArticleInfo(revision.Text,new Title(page.Title.Namespace,page.Title.Text)); |
| 58 | + Title t = new Title(page.Title.Namespace,page.Title.Text); |
| 59 | + try{ |
| 60 | + links.addArticleInfo(revision.Text,t); |
| 61 | + } catch(Exception e){ |
| 62 | + log.error("Error adding article "+t+" : "+e.getMessage()); |
| 63 | + e.printStackTrace(); |
| 64 | + } |
55 | 65 | } |
56 | 66 | public void writeSiteinfo(Siteinfo info) throws IOException { |
57 | 67 | siteinfo = info; |
| 68 | + // write siteinfo to localization |
| 69 | + Iterator it = info.Namespaces.orderedEntries(); |
| 70 | + while(it.hasNext()){ |
| 71 | + Entry<Integer,String> pair = (Entry<Integer,String>)it.next(); |
| 72 | + Localization.addCustomMapping(pair.getValue(),pair.getKey(),iid.getDBname()); |
| 73 | + links.addToNamespaceMap(pair.getValue(),pair.getKey()); |
| 74 | + } |
58 | 75 | } |
59 | 76 | public void close() throws IOException { |
60 | 77 | // nop |
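writeSiteinfo now registers the dump's own namespace names while streaming past the dump header, so localized prefixes resolve during the same pass. For a hypothetical German dump, one iteration of the loop above amounts to:

    // Hypothetical siteinfo entry: namespace 1 is named "Diskussion"
    Localization.addCustomMapping("Diskussion", 1, iid.getDBname());
    links.addToNamespaceMap("Diskussion", 1);
    // after which a [[Diskussion:Foo]] link can be keyed as "1:Foo"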
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Links.java |
— | — | @@ -1,6 +1,11 @@ |
2 | 2 | package org.wikimedia.lsearch.ranks; |
3 | 3 | |
| 4 | +import java.io.ByteArrayInputStream; |
| 5 | +import java.io.ByteArrayOutputStream; |
4 | 6 | import java.io.IOException; |
| 7 | +import java.io.ObjectInputStream; |
| 8 | +import java.io.ObjectOutputStream; |
| 9 | +import java.io.StringWriter; |
5 | 10 | import java.util.ArrayList; |
6 | 11 | import java.util.Collection; |
7 | 12 | import java.util.HashMap; |
— | — | @@ -15,6 +20,9 @@ |
16 | 21 | import org.apache.lucene.analysis.SimpleAnalyzer; |
17 | 22 | import org.apache.lucene.document.Document; |
18 | 23 | import org.apache.lucene.document.Field; |
| 24 | +import org.apache.lucene.document.FieldSelector; |
| 25 | +import org.apache.lucene.document.SetBasedFieldSelector; |
| 26 | +import org.apache.lucene.index.CorruptIndexException; |
19 | 27 | import org.apache.lucene.index.IndexReader; |
20 | 28 | import org.apache.lucene.index.IndexWriter; |
21 | 29 | import org.apache.lucene.index.Term; |
— | — | @@ -29,6 +37,7 @@ |
30 | 38 | import org.wikimedia.lsearch.config.IndexId; |
31 | 39 | import org.wikimedia.lsearch.index.WikiIndexModifier; |
32 | 40 | import org.wikimedia.lsearch.related.CompactArticleLinks; |
| 41 | +import org.wikimedia.lsearch.search.NamespaceFilter; |
33 | 42 | import org.wikimedia.lsearch.spell.api.Dictionary; |
34 | 43 | import org.wikimedia.lsearch.spell.api.LuceneDictionary; |
35 | 44 | import org.wikimedia.lsearch.spell.api.Dictionary.Word; |
— | — | @@ -40,61 +49,93 @@ |
41 | 50 | protected String langCode; |
42 | 51 | protected IndexWriter writer = null; |
43 | 52 | protected HashMap<String,Integer> nsmap = null; |
44 | | - protected HashSet<String> interwiki = new HashSet<String>(); |
| 53 | + protected HashSet<String> interwiki; |
| 54 | + protected HashSet<String> categoryLocalized; |
| 55 | + protected HashSet<String> imageLocalized; |
45 | 56 | protected IndexReader reader = null; |
46 | 57 | protected String path; |
47 | | - protected enum State { MODIFIED_TITLES, FLUSHED, MODIFIED_ARTICLES, READ }; |
| 58 | + protected enum State { FLUSHED, WRITE, MODIFIED, READ }; |
48 | 59 | protected State state; |
49 | | - protected Directory directory; |
| 60 | + protected Directory directory = null; |
| 61 | + protected NamespaceFilter nsf; // default search |
| 62 | + protected ObjectCache cache; |
| 63 | + //protected ObjectCache refCache; |
| 64 | + protected FieldSelector keyOnly,redirectOnly,contextOnly,linksOnly; |
50 | 65 | |
51 | | - private Links(IndexId iid){ |
| 66 | + private Links(IndexId iid, String path, IndexWriter writer) throws CorruptIndexException, IOException{ |
| 67 | + this.writer = writer; |
| 68 | + this.path = path; |
52 | 69 | this.iid = iid; |
53 | | - this.langCode = GlobalConfiguration.getInstance().getLanguage(iid); |
| 70 | + GlobalConfiguration global = GlobalConfiguration.getInstance(); |
| 71 | + this.langCode = global.getLanguage(iid); |
| 72 | + String dbname = iid.getDBname(); |
| 73 | + nsmap = Localization.getLocalizedNamespaces(langCode,dbname); |
| 74 | + interwiki = Localization.getInterwiki(); |
| 75 | + categoryLocalized = Localization.getLocalizedCategory(langCode,dbname); |
| 76 | + imageLocalized = Localization.getLocalizedImage(langCode,dbname); |
| 77 | + state = State.FLUSHED; |
| 78 | + initWriter(writer); |
| 79 | + //reader = IndexReader.open(path); |
| 80 | + nsf = global.getDefaultNamespace(iid); |
| 81 | + cache = new ObjectCache(10000); |
| 82 | + // init cache manager |
| 83 | + /*CacheManager manager = CacheManager.create(); |
| 84 | + cache = new Cache("links", 5000, false, false, 5, 2); |
| 85 | + manager.addCache(cache); */ |
| 86 | + keyOnly = makeSelector("article_key"); |
| 87 | + redirectOnly = makeSelector("redirect"); |
| 88 | + contextOnly = makeSelector("context"); |
| 89 | + linksOnly = makeSelector("links"); |
54 | 90 | } |
55 | 91 | |
56 | | - public static Links openExisting(IndexId iid) throws IOException{ |
57 | | - Links links = new Links(iid); |
58 | | - links.path = iid.getTempPath(); |
59 | | - log.info("Using index at "+links.path); |
60 | | - links.writer = WikiIndexModifier.openForWrite(links.path,false); |
61 | | - initWriter(links.writer); |
62 | | - links.reader = IndexReader.open(links.path); |
63 | | - links.nsmap = Localization.getLocalizedNamespaces(links.langCode); |
64 | | - links.interwiki = Localization.getInterwiki(); |
65 | | - links.state = State.FLUSHED; |
66 | | - links.directory = links.writer.getDirectory(); |
67 | | - return links; |
| 92 | + protected FieldSelector makeSelector(String field){ |
| 93 | + HashSet<String> onlySet = new HashSet<String>(); |
| 94 | + onlySet.add(field); |
| 95 | + return new SetBasedFieldSelector(onlySet, new HashSet<String>()); |
68 | 96 | } |
69 | 97 | |
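The four FieldSelector instances built in the constructor let readers pull a single stored field out of a document without paying for the compressed links and context fields. A minimal sketch of how the class uses them (docid is any Lucene document id):

    // Sketch: load only article_key for a given Lucene doc id
    Document d = reader.document(docid, keyOnly);
    String key = d.get("article_key"); // links/context bytes are never read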
70 | | - private static void initWriter(IndexWriter writer) { |
71 | | - writer.setMergeFactor(20); |
72 | | - writer.setMaxBufferedDocs(500); |
73 | | - writer.setUseCompoundFile(true); |
| 98 | + private void initWriter(IndexWriter writer) { |
| 99 | + if(writer != null){ |
| 100 | + writer.setMergeFactor(20); |
| 101 | + writer.setMaxBufferedDocs(500); |
| 102 | + writer.setUseCompoundFile(true); |
| 103 | + if(directory == null) |
| 104 | + directory = writer.getDirectory(); |
| 105 | + } |
74 | 106 | } |
75 | | - |
| 107 | + |
| 108 | + /** Open the index path for updates */ |
| 109 | + public static Links openForModification(IndexId iid) throws IOException{ |
| 110 | + iid = iid.getLinks(); |
| 111 | + String path = iid.getIndexPath(); |
| 112 | + log.info("Using index at "+path); |
| 113 | + IndexWriter writer = WikiIndexModifier.openForWrite(path,false); |
| 114 | + return new Links(iid,path,writer); |
| 115 | + } |
| 116 | + |
| 117 | + /** Open index at path for reading */ |
| 118 | + public static Links openForRead(IndexId iid, String path) throws IOException { |
| 119 | + iid = iid.getLinks(); |
| 120 | + log.info("Opening for read "+path); |
| 121 | + return new Links(iid,path,null); |
| 122 | + } |
| 123 | + |
 | 124 | + /** Create a new links index in the import path */
76 | 125 | public static Links createNew(IndexId iid) throws IOException{ |
77 | | - Links links = new Links(iid); |
78 | | - links.path = iid.getTempPath(); |
79 | | - log.info("Making index at "+links.path); |
80 | | - links.writer = WikiIndexModifier.openForWrite(links.path,true); |
81 | | - links.reader = IndexReader.open(links.path); |
82 | | - links.nsmap = Localization.getLocalizedNamespaces(links.langCode); |
83 | | - links.interwiki = Localization.getInterwiki(); |
84 | | - links.state = State.FLUSHED; |
85 | | - links.directory = links.writer.getDirectory(); |
| 126 | + iid = iid.getLinks(); |
| 127 | + String path = iid.getImportPath(); |
| 128 | + log.info("Making index at "+path); |
| 129 | + IndexWriter writer = WikiIndexModifier.openForWrite(path,true); |
| 130 | + Links links = new Links(iid,path,writer); |
86 | 131 | return links; |
87 | 132 | } |
88 | 133 | |
| 134 | + /** Create new index in memory (RAMDirectory) */ |
89 | 135 | public static Links createNewInMemory(IndexId iid) throws IOException{ |
90 | | - Links links = new Links(iid); |
91 | | - links.path = iid.getTempPath(); |
92 | | - log.info("Making index at "+links.path); |
93 | | - links.writer = new IndexWriter(new RAMDirectory(),new SimpleAnalyzer(),true); |
94 | | - links.reader = IndexReader.open(links.path); |
95 | | - links.nsmap = Localization.getLocalizedNamespaces(links.langCode); |
96 | | - links.interwiki = Localization.getInterwiki(); |
97 | | - links.state = State.FLUSHED; |
98 | | - links.directory = links.writer.getDirectory(); |
| 136 | + iid = iid.getLinks(); |
| 137 | + log.info("Making index in memory"); |
| 138 | + IndexWriter writer = new IndexWriter(new RAMDirectory(),new SimpleAnalyzer(),true); |
| 139 | + Links links = new Links(iid,null,writer); |
99 | 140 | return links; |
100 | 141 | } |
101 | 142 | |
— | — | @@ -105,23 +146,21 @@ |
106 | 147 | } |
107 | 148 | } |
108 | 149 | |
| 150 | + /** Add a custom namespace mapping */ |
109 | 151 | public void addToNamespaceMap(String namespace, int index){ |
110 | 152 | nsmap.put(namespace.toLowerCase(),index); |
111 | 153 | } |
112 | 154 | |
113 | | - /** Write all changes, call after batch-adding of titles and articles |
| 155 | + /** Write all changes, optimize/close everything |
114 | 156 | * @throws IOException */ |
115 | 157 | public void flush() throws IOException{ |
116 | 158 | // close & optimize |
117 | | - reader.close(); |
| 159 | + if(reader != null) |
| 160 | + reader.close(); |
118 | 161 | if(writer != null){ |
119 | 162 | writer.optimize(); |
120 | 163 | writer.close(); |
121 | 164 | } |
122 | | - // reopen |
123 | | - writer = new IndexWriter(directory, new SimpleAnalyzer(), false); |
124 | | - initWriter(writer); |
125 | | - reader = IndexReader.open(path); |
126 | 165 | state = State.FLUSHED; |
127 | 166 | } |
128 | 167 | |
— | — | @@ -130,41 +169,71 @@ |
131 | 170 | * Can still read. |
132 | 171 | * @throws IOException |
133 | 172 | */ |
134 | | - public void flushForRead() throws IOException{ |
| 173 | + protected void flushForRead() throws IOException{ |
135 | 174 | // close & optimize |
136 | | - reader.close(); |
137 | | - writer.optimize(); |
138 | | - writer.close(); |
| 175 | + if(reader != null) |
| 176 | + reader.close(); |
| 177 | + if(writer != null){ |
| 178 | + writer.optimize(); |
| 179 | + writer.close(); |
| 180 | + } |
| 181 | + log.debug("Opening index reader"); |
139 | 182 | // reopen |
140 | 183 | reader = IndexReader.open(path); |
141 | 184 | writer = null; |
142 | 185 | state = State.READ; |
143 | 186 | } |
144 | 187 | |
145 | | - /** Add a title to enable proper link analysis when adding articles |
146 | | - * @throws IOException */ |
147 | | - public void addTitle(Title t) throws IOException{ |
148 | | - Document doc = new Document(); |
149 | | - doc.add(new Field("namespace",Integer.toString(t.getNamespace()),Field.Store.YES,Field.Index.UN_TOKENIZED)); |
150 | | - doc.add(new Field("title",t.getTitle(),Field.Store.YES,Field.Index.UN_TOKENIZED)); |
151 | | - doc.add(new Field("title_key",t.getKey(),Field.Store.YES,Field.Index.UN_TOKENIZED)); |
152 | | - writer.addDocument(doc); |
153 | | - state = State.MODIFIED_TITLES; |
| 188 | + /** Open the writer, and close the reader (if any) */ |
| 189 | + protected void openForWrite() throws IOException{ |
| 190 | + if(reader != null) |
| 191 | + reader.close(); |
| 192 | + if(writer == null){ |
| 193 | + if(directory == null) |
| 194 | + throw new RuntimeException("Opened for read, but trying to write"); |
| 195 | + writer = new IndexWriter(directory,new SimpleAnalyzer(),false); |
| 196 | + initWriter(writer); |
| 197 | + reader = null; |
| 198 | + state = State.WRITE; |
| 199 | + } |
154 | 200 | } |
155 | 201 | |
| 202 | + protected void ensureRead() throws IOException { |
| 203 | + if(state != State.READ) |
| 204 | + flushForRead(); |
| 205 | + } |
| 206 | + |
| 207 | + protected void ensureWrite() throws IOException { |
| 208 | + if(writer == null) |
| 209 | + openForWrite(); |
| 210 | + } |
| 211 | + |
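Together with the factory methods above, ensureRead/ensureWrite implement a small state machine: the index is either readable or writable, never both, and methods flip the state lazily. A minimal lifecycle sketch, assuming an article Title t, its wikitext text, and a hypothetical key "0:Foo":

    // Sketch: lazy switching between writer and reader
    Links links = Links.openForModification(iid);
    links.modifyArticleInfo(text, t);     // ensureWrite(): closes reader, opens writer
    int n = links.getNumInLinks("0:Foo"); // ensureRead(): optimizes+closes writer, opens reader
    links.close();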
| 212 | + /** Modify existing article links info */ |
| 213 | + public void modifyArticleInfo(String text, Title t) throws IOException{ |
| 214 | + ensureWrite(); |
| 215 | + writer.deleteDocuments(new Term("article_key",t.getKey())); |
| 216 | + addArticleInfo(text,t); |
| 217 | + } |
| 218 | + |
156 | 219 | /** Add links and other info from article |
157 | 220 | * @throws IOException */ |
158 | 221 | public void addArticleInfo(String text, Title t) throws IOException{ |
159 | | - if(state == State.MODIFIED_TITLES) |
160 | | - flush(); |
| 222 | + ensureWrite(); |
161 | 223 | Pattern linkPat = Pattern.compile("\\[\\[(.*?)(\\|(.*?))?\\]\\]"); |
162 | 224 | int namespace = t.getNamespace(); |
163 | 225 | Matcher matcher = linkPat.matcher(text); |
164 | 226 | int ns; String title; |
165 | 227 | boolean escaped; |
| 228 | + |
166 | 229 | HashSet<String> pagelinks = new HashSet<String>(); |
167 | | - HashSet<String> linkkeys = new HashSet<String>(); |
| 230 | + // article link -> contexts |
| 231 | + HashMap<String,ArrayList<String>> contextMap = new HashMap<String,ArrayList<String>>(); |
168 | 232 | |
 | 233 | + // extract link context only when the page's namespace is in the default search set
| 234 | + boolean useContext = nsf.contains(t.getNamespace()); |
| 235 | + |
| 236 | + ContextParser cp = new ContextParser(text,imageLocalized,categoryLocalized,interwiki); |
| 237 | + |
169 | 238 | Title redirect = Localization.getRedirectTitle(text,langCode); |
170 | 239 | String redirectsTo = null; |
171 | 240 | if(redirect != null){ |
— | — | @@ -172,9 +241,8 @@ |
173 | 242 | } else { |
174 | 243 | while(matcher.find()){ |
175 | 244 | String link = matcher.group(1); |
176 | | - String anchor = matcher.group(2); |
177 | | - if(anchor != null && anchor.length()>1 && anchor.substring(1).equalsIgnoreCase(title(link))) |
178 | | - anchor = null; // anchor same as link text |
| 245 | + ContextParser.Context context = useContext? cp.getNext(matcher.start(1)) : null; |
| 246 | + |
179 | 247 | int fragment = link.lastIndexOf('#'); |
180 | 248 | if(fragment != -1) |
181 | 249 | link = link.substring(0,fragment); |
— | — | @@ -204,156 +272,107 @@ |
205 | 273 | } |
206 | 274 | if(ns == 0 && namespace!=0) |
207 | 275 | continue; // skip links from other namespaces into the main namespace |
208 | | - String target = findTargetLink(ns,title); |
| 276 | + String target = findTargetLink(ns,title); |
209 | 277 | if(target != null){ |
210 | | - //System.out.println("Found "+link); |
211 | | - linkkeys.add(target); // for outlink storage |
212 | | - pagelinks.add(target+"|"); // for backlinks |
213 | | - if(anchor != null && !"|".equals(anchor)) |
214 | | - pagelinks.add(target+anchor); // for efficient anchortext extraction |
| 278 | + int targetNs = Integer.parseInt(target.substring(0,target.indexOf(':'))); |
| 279 | + pagelinks.add(target); // for outlink storage |
| 280 | + // register context of this link |
| 281 | + if(context != null && nsf.contains(targetNs)){ |
| 282 | + ArrayList<String> ct = contextMap.get(target); |
| 283 | + if(ct==null){ |
| 284 | + ct = new ArrayList<String>(); |
| 285 | + contextMap.put(target,ct); |
| 286 | + } |
| 287 | + ct.add(context.get(text)); |
| 288 | + } |
215 | 289 | } |
216 | 290 | } |
217 | 291 | } |
218 | 292 | // index article |
219 | | - StringList sl = new StringList(pagelinks); |
220 | | - StringList lk = new StringList(linkkeys); |
| 293 | + StringList lk = new StringList(pagelinks); |
221 | 294 | Analyzer an = new SplitAnalyzer(); |
222 | 295 | Document doc = new Document(); |
223 | | - doc.add(new Field("namespace",t.getNamespaceAsString(),Field.Store.YES,Field.Index.UN_TOKENIZED)); |
224 | | - doc.add(new Field("title",t.getTitle(),Field.Store.YES,Field.Index.UN_TOKENIZED)); |
225 | 296 | doc.add(new Field("article_key",t.getKey(),Field.Store.YES,Field.Index.UN_TOKENIZED)); |
226 | 297 | if(redirectsTo != null) |
227 | | - doc.add(new Field("redirect",redirectsTo,Field.Store.YES,Field.Index.UN_TOKENIZED)); |
| 298 | + doc.add(new Field("redirect",redirectsTo+"|"+t.getKey(),Field.Store.YES,Field.Index.UN_TOKENIZED)); |
228 | 299 | else{ |
229 | | - doc.add(new Field("links",sl.toString(),Field.Store.NO,Field.Index.TOKENIZED)); |
230 | | - doc.add(new Field("links_stored",lk.toString(),Field.Store.YES,Field.Index.TOKENIZED)); |
| 300 | + doc.add(new Field("links",lk.toString(),Field.Store.COMPRESS,Field.Index.TOKENIZED)); |
231 | 301 | } |
| 302 | + if(contextMap.size() != 0){ |
| 303 | + /*for(Entry<String,ArrayList<String>> e : contextMap.entrySet()){ |
| 304 | + Document con = new Document(); |
| 305 | + con.add(new Field("context_key",e.getKey()+"|"+t.getKey(),Field.Store.NO,Field.Index.UN_TOKENIZED)); |
| 306 | + con.add(new Field("context",new StringList(e.getValue()).toString(),Field.Store.COMPRESS,Field.Index.NO)); |
| 307 | + writer.addDocument(con,an); |
| 308 | + }*/ |
| 309 | + // serialize the java object (contextMap) into context field |
| 310 | + //ByteArrayOutputStream ba = new ByteArrayOutputStream(); |
| 311 | + //ObjectOutputStream ob = new ObjectOutputStream(ba); |
| 312 | + //ob.writeObject(contextMap); |
| 313 | + //doc.add(new Field("context",ba.toByteArray(),Field.Store.COMPRESS)); |
| 314 | + doc.add(new Field("context",new StringMap(contextMap).serialize(),Field.Store.COMPRESS)); |
| 315 | + } |
232 | 316 | |
233 | 317 | writer.addDocument(doc,an); |
234 | | - state = State.MODIFIED_ARTICLES; |
| 318 | + state = State.MODIFIED; |
235 | 319 | } |
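Each page now becomes a single document: article_key holds the page key, redirects store a packed "target|source" term, and ordinary pages store a compressed links list plus an optional binary context StringMap. A sketch of reading one back, assuming an open IndexReader on this index:

    // Sketch: fields written by addArticleInfo, read back per article
    TermDocs td = reader.termDocs(new Term("article_key", "0:Douglas Adams"));
    if (td.next()) {
        Document d = reader.document(td.doc());
        String redirect = d.get("redirect");          // "target|source", null for ordinary pages
        String links = d.get("links");                // StringList of outgoing keys, null for redirects
        byte[] context = d.getBinaryValue("context"); // serialized StringMap, may be null
    }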
236 | | - public static HashSet<Character> separators = new HashSet<Character>(); |
237 | | - static{ |
238 | | - separators.add(' '); |
239 | | - separators.add('\r'); |
240 | | - separators.add('\n'); |
241 | | - separators.add('\t'); |
242 | | - separators.add(':'); |
243 | | - separators.add('('); |
244 | | - separators.add(')'); |
245 | | - separators.add('['); |
246 | | - separators.add(']'); |
247 | | - separators.add('.'); |
248 | | - separators.add(','); |
249 | | - separators.add(':'); |
250 | | - separators.add(';'); |
251 | | - separators.add('"'); |
252 | | - separators.add('+'); |
253 | | - separators.add('*'); |
254 | | - separators.add('!'); |
255 | | - separators.add('~'); |
256 | | - separators.add('$'); |
257 | | - separators.add('%'); |
258 | | - separators.add('^'); |
259 | | - separators.add('&'); |
260 | | - separators.add('_'); |
261 | | - separators.add('='); |
262 | | - separators.add('|'); |
263 | | - separators.add('\\'); |
264 | | - } |
265 | 320 | |
266 | | - /** |
267 | | - * Find a sentance boundaries |
268 | | - * |
269 | | - * @param text - raw text |
270 | | - * @param start - start index to search from |
271 | | - * @param reverse - if true, will lookup in reverse |
272 | | - * @param max - radius of search (if no boundary is found return last wordbreak) |
273 | | - * @return |
274 | | - */ |
275 | | - protected int findSentance(char[] text, int start, boolean reverse, int max){ |
276 | | - int inc = (reverse)? -1 : 1; |
277 | | - int count = 0; |
278 | | - int wordbreak = start; |
279 | | - int i = start; |
280 | | - for(;i>0 && i<text.length;i+=inc){ |
281 | | - char c = text[i]; |
282 | | - if(c == '.') |
283 | | - return i; |
284 | | - else if(c == '*' && ((i>1 && text[i-1]=='\n') || i==0)) |
285 | | - return i; |
286 | | - else if(separators.contains(c)) |
287 | | - wordbreak = i; |
288 | | - if(count >= max) |
289 | | - return wordbreak; // more than max chars away, return the latest wordbreak |
290 | | - count ++; |
291 | | - } |
292 | | - return i; |
293 | | - } |
294 | | - |
295 | | - /** Find surrounding for a link - extract sentances, list items .... */ |
296 | | - protected String findContext(char[] text, int start, int end){ |
297 | | - // TODO: implement |
298 | | - return null; |
299 | | - } |
300 | | - |
301 | 321 | /** Find the target key (ns:title) to which the link points
302 | 322 | * @throws IOException */ |
303 | 323 | protected String findTargetLink(int ns, String title) throws IOException{ |
304 | 324 | String key; |
305 | 325 | if(title.length() == 0) |
306 | 326 | return null; |
307 | | - // try exact match |
308 | | - key = ns+":"+title; |
309 | | - if(reader.docFreq(new Term("title_key",key)) != 0) |
310 | | - return key; |
311 | | - // try lowercase |
312 | | - key = ns+":"+title.toLowerCase(); |
313 | | - if(reader.docFreq(new Term("title_key",key)) != 0) |
314 | | - return key; |
315 | | - // try lowercase with first letter upper case |
| 327 | + |
| 328 | + // first letter uppercase |
316 | 329 | if(title.length()==1) |
317 | 330 | key = ns+":"+title.toUpperCase(); |
318 | 331 | else |
319 | | - key = ns+":"+title.substring(0,1).toUpperCase()+title.substring(1).toLowerCase(); |
320 | | - if(reader.docFreq(new Term("title_key",key)) != 0) |
321 | | - return key; |
322 | | - // try title case |
323 | | - key = ns+":"+WordUtils.capitalize(title); |
324 | | - if(reader.docFreq(new Term("title_key",key)) != 0) |
325 | | - return key; |
326 | | - // try upper case |
327 | | - key = ns+":"+title.toUpperCase(); |
328 | | - if(reader.docFreq(new Term("title_key",key)) != 0) |
329 | | - return key; |
330 | | - // try capitalizing at word breaks |
331 | | - key = ns+":"+WordUtils.capitalize(title,new char[] {' ','-','(',')','}','{','.',',','?','!'}); |
332 | | - if(reader.docFreq(new Term("title_key",key)) != 0) |
333 | | - return key; |
334 | | - |
335 | | - return null; |
| 332 | + key = ns+":"+title.substring(0,1).toUpperCase()+title.substring(1); |
| 333 | + return key; // index everything, even if the target article doesn't exist |
336 | 334 | } |
337 | 335 | |
338 | 336 | /** Get number of backlinks to this title */ |
339 | 337 | public int getNumInLinks(String key) throws IOException{ |
340 | | - return reader.docFreq(new Term("links",key+"|")); |
| 338 | + ensureRead(); |
| 339 | + /*String cacheKey = "getNumInLinks:"+key; |
| 340 | + Object ref = refCache.get(cacheKey); |
| 341 | + if(ref != null) |
| 342 | + return (Integer) ref; |
| 343 | + else{ */ |
| 344 | + int r = reader.docFreq(new Term("links",key)); |
| 345 | + //refCache.put(cacheKey,r); |
| 346 | + return r; |
| 347 | + //} |
341 | 348 | } |
342 | 349 | |
| 350 | + @Deprecated |
343 | 351 | /** Get all article titles that redirect to given title */ |
344 | | - public ArrayList<String> getRedirectsTo(String key) throws IOException{ |
| 352 | + public ArrayList<String> getRedirectsToOld(String key) throws IOException{ |
| 353 | + ensureRead(); |
345 | 354 | ArrayList<String> ret = new ArrayList<String>(); |
346 | 355 | TermDocs td = reader.termDocs(new Term("redirect",key)); |
347 | 356 | while(td.next()){ |
348 | | - ret.add(reader.document(td.doc()).get("article_key")); |
| 357 | + ret.add(reader.document(td.doc(),keyOnly).get("article_key")); |
349 | 358 | } |
350 | 359 | return ret; |
351 | 360 | } |
352 | 361 | |
353 | | - protected void ensureRead() throws IOException { |
354 | | - if(state != State.READ) |
355 | | - flushForRead(); |
| 362 | + /** Get all article titles that redirect to given title */ |
| 363 | + public ArrayList<String> getRedirectsTo(String key) throws IOException{ |
| 364 | + ensureRead(); |
| 365 | + ArrayList<String> ret = new ArrayList<String>(); |
| 366 | + String prefix = key+"|"; |
| 367 | + TermEnum te = reader.terms(new Term("redirect",prefix)); |
 | 368 | + // terms() is positioned at the first term >= prefix, so inspect te.term()
 | 369 | + // before advancing, otherwise the first redirect source is skipped
 | 370 | + do{
 | 371 | + Term term = te.term();
 | 372 | + if(term == null || !term.field().equals("redirect") || !term.text().startsWith(prefix))
 | 373 | + break;
 | 374 | + String t = term.text();
 | 375 | + ret.add(t.substring(t.indexOf('|')+1));
 | 376 | + } while(te.next());
| 375 | + return ret; |
356 | 376 | } |
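Packing the redirect field as "target|source" turns the backlink lookup into a prefix scan over the term dictionary, with no documents loaded at all. With hypothetical titles, the term index sorts as:

    // redirect terms, lexicographically ordered:
    //   0:Douglas Adams|0:Bop Ad
    //   0:Douglas Adams|0:Douglas Noel Adams
    //   0:Earth|0:Terra
    // getRedirectsTo("0:Douglas Adams") seeks to "0:Douglas Adams|" and stops at
    // the first term without that prefix ("0:Earth|..."), yielding both sources.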
357 | | - |
358 | 377 | |
359 | 378 | /** If an article is a redirect |
360 | 379 | * @throws IOException */ |
— | — | @@ -361,75 +380,43 @@ |
362 | 381 | ensureRead(); |
363 | 382 | TermDocs td = reader.termDocs(new Term("article_key",key)); |
364 | 383 | if(td.next()){ |
365 | | - if(reader.document(td.doc()).get("redirect")!=null) |
| 384 | + if(reader.document(td.doc(),redirectOnly).get("redirect")!=null) |
366 | 385 | return true; |
367 | 386 | } |
368 | 387 | return false; |
369 | 388 | } |
370 | | - |
| 389 | + |
| 390 | + @Deprecated |
371 | 391 | /** If article is redirect, get target, else null */ |
372 | | - public String getRedirectTarget(String key) throws IOException{ |
| 392 | + public String getRedirectTargetOld(String key) throws IOException{ |
373 | 393 | ensureRead(); |
374 | 394 | TermDocs td = reader.termDocs(new Term("article_key",key)); |
375 | 395 | if(td.next()){ |
376 | | - return reader.document(td.doc()).get("redirect"); |
| 396 | + return reader.document(td.doc(),redirectOnly).get("redirect"); |
377 | 397 | } |
378 | 398 | return null; |
379 | 399 | } |
380 | 400 | |
381 | | - /** Get only anchors without frequency */ |
382 | | - public ArrayList<String> getAnchors(String key) throws IOException{ |
| 401 | + /** If article is redirect, get target, else null */ |
| 402 | + public String getRedirectTarget(String key) throws IOException{ |
383 | 403 | ensureRead(); |
384 | | - ArrayList<String> ret = new ArrayList<String>(); |
385 | | - TermEnum te = reader.terms(new Term("links",key+"|")); |
386 | | - while(te.next()){ |
387 | | - String t = te.term().text(); |
388 | | - if(!t.startsWith(key) || !te.term().field().equals("links")) |
389 | | - break; |
390 | | - ret.add(t.substring(key.length()+1)); |
| 404 | + TermDocs td = reader.termDocs(new Term("article_key",key)); |
| 405 | + if(td.next()){ |
 | 406 | + String t = reader.document(td.doc(),redirectOnly).get("redirect");
 | 407 | + if(t == null)
 | 408 | + return null; // not a redirect
 | 409 | + return t.substring(0,t.indexOf('|')); // field holds "target|source"; return the target
391 | 408 | } |
392 | | - return ret; |
| 409 | + return null; |
393 | 410 | } |
394 | | - |
395 | | - /** Get title part of the key (ns:title) */ |
396 | | - private String title(String key) { |
397 | | - return key.substring(key.indexOf(':')+1); |
398 | | - } |
399 | 411 | |
400 | | - /** Get anchor texts for given title |
401 | | - * @throws IOException */ |
402 | | - public ArrayList<AnchorText> getAnchorText(String key) throws IOException{ |
403 | | - ensureRead(); |
404 | | - ArrayList<AnchorText> ret = new ArrayList<AnchorText>(); |
405 | | - TermEnum te = reader.terms(new Term("links",key+"|")); |
406 | | - while(te.next()){ |
407 | | - if(!te.term().text().startsWith(key) || !te.term().field().equals("links")) |
408 | | - break; |
409 | | - ret.add(new AnchorText(te.term().text().substring(key.length()),te.docFreq())); |
410 | | - } |
411 | | - return ret; |
412 | | - } |
413 | 412 | |
414 | | - static public class AnchorText { |
415 | | - public String text; /** ns:title **/ |
416 | | - public int freq; |
417 | | - public AnchorText(String text, int freq) { |
418 | | - this.text = text; |
419 | | - this.freq = freq; |
420 | | - } |
421 | | - } |
422 | | - |
423 | | - /** Get all article titles linking to given title |
424 | | - * @throws IOException */ |
425 | | - public ArrayList<String> getInLinks(String key, HashMap<Integer,String> keyCache) throws IOException{ |
 | 413 | + /** Return the namespace of the redirect target (if any) */
| 414 | + public int getRedirectTargetNamespace(String key) throws IOException{ |
426 | 415 | ensureRead(); |
427 | | - ArrayList<String> ret = new ArrayList<String>(); |
428 | | - TermDocs td = reader.termDocs(new Term("links",key+"|")); |
429 | | - while(td.next()){ |
430 | | - ret.add(keyCache.get(td.doc())); |
431 | | - //ret.add(reader.document(td.doc()).get("article_key")); |
| 416 | + String t = getRedirectTarget(key); |
| 417 | + if(t != null){ |
 | 418 | + return Integer.parseInt(t.substring(0,t.indexOf(':'))); // target key is "ns:title"
432 | 419 | } |
433 | | - return ret; |
| 420 | + return 0; |
434 | 421 | } |
435 | 422 | |
436 | 423 | /** Get all article titles linking to given title |
— | — | @@ -437,9 +424,11 @@ |
438 | 425 | public ArrayList<CompactArticleLinks> getInLinks(CompactArticleLinks key, HashMap<Integer,CompactArticleLinks> keyCache) throws IOException{ |
439 | 426 | ensureRead(); |
440 | 427 | ArrayList<CompactArticleLinks> ret = new ArrayList<CompactArticleLinks>(); |
441 | | - TermDocs td = reader.termDocs(new Term("links",key+"|")); |
| 428 | + TermDocs td = reader.termDocs(new Term("links",key.toString())); |
442 | 429 | while(td.next()){ |
443 | | - ret.add(keyCache.get(td.doc())); |
| 430 | + CompactArticleLinks cs = keyCache.get(td.doc()); |
| 431 | + if(cs != null) |
| 432 | + ret.add(cs); |
444 | 433 | } |
445 | 434 | return ret; |
446 | 435 | } |
— | — | @@ -449,9 +438,9 @@ |
450 | 439 | public ArrayList<String> getInLinks(String key) throws IOException{ |
451 | 440 | ensureRead(); |
452 | 441 | ArrayList<String> ret = new ArrayList<String>(); |
453 | | - TermDocs td = reader.termDocs(new Term("links",key+"|")); |
| 442 | + TermDocs td = reader.termDocs(new Term("links",key)); |
454 | 443 | while(td.next()){ |
455 | | - ret.add(reader.document(td.doc()).get("article_key")); |
| 444 | + ret.add(reader.document(td.doc(),keyOnly).get("article_key")); |
456 | 445 | } |
457 | 446 | return ret; |
458 | 447 | } |
— | — | @@ -461,60 +450,77 @@ |
462 | 451 | ensureRead(); |
463 | 452 | TermDocs td = reader.termDocs(new Term("article_key",key)); |
464 | 453 | if(td.next()){ |
465 | | - return new StringList(reader.document(td.doc()).get("links_stored")); |
| 454 | + return new StringList(reader.document(td.doc(),linksOnly).get("links")); |
466 | 455 | } |
467 | 456 | return null; |
468 | 457 | } |
469 | 458 | |
470 | | - public Dictionary getKeys() throws IOException{ |
 | 459 | + /** Get all contexts in which article <i>to</i> is linked from <i>from</i>.
 | 460 | + * Returns null if there is no context, or the link is invalid. */
| 462 | + @SuppressWarnings("unchecked") |
| 463 | + public ArrayList<String> getContext(String from, String to) throws IOException { |
471 | 464 | ensureRead(); |
472 | | - return new LuceneDictionary(reader,"article_key"); |
473 | | - } |
474 | | - @Deprecated |
475 | | - protected void cacheInLinks() throws IOException{ |
476 | | - if(state != State.FLUSHED) |
477 | | - flush(); |
478 | | - log.info("Caching in-links"); |
479 | | - int count = 0; |
480 | | - // docid -> key |
481 | | - HashMap<Integer,String> keyCache = new HashMap<Integer,String>(); |
482 | | - Dictionary dict = new LuceneDictionary(reader,"article_key"); |
483 | | - Word w; |
484 | | - // build key cache |
485 | | - while((w = dict.next()) != null){ |
486 | | - String key = w.getWord(); |
487 | | - TermDocs td = reader.termDocs(new Term("article_key",key)); |
488 | | - if(td.next()){ |
489 | | - keyCache.put(td.doc(),key); |
490 | | - } else |
491 | | - log.error("Cannot find article for key "+key); |
| 465 | + String cacheKey = "getContext:"+from; |
| 466 | + //Element fromCache = cache.get(cacheKey); |
| 467 | + Object fromCache = cache.get(cacheKey); |
| 468 | + if(fromCache != null){ |
| 469 | + //HashMap<String, ArrayList<String>> map = (HashMap<String, ArrayList<String>>) fromCache.getObjectValue(); |
| 470 | + //HashMap<String, ArrayList<String>> map = (HashMap<String, ArrayList<String>>) fromCache; |
| 471 | + StringMap map = (StringMap) fromCache; |
| 472 | + return map.get(to); |
492 | 473 | } |
493 | | - |
494 | | - // get inlinks |
495 | | - for(String key : keyCache.values()){ |
496 | | - ArrayList<String> in = getInLinks(key,keyCache); |
497 | | - Document doc = new Document(); |
498 | | - doc.add(new Field("inlinks_key",key,Field.Store.YES,Field.Index.UN_TOKENIZED)); |
499 | | - doc.add(new Field("inlinks",new StringList(in).toString(),Field.Store.YES,Field.Index.UN_TOKENIZED)); |
500 | | - writer.addDocument(doc); |
501 | | - count ++; |
502 | | - if(count % 1000 == 0){ |
503 | | - System.out.println("Cached inlinks for "+count); |
| 474 | + TermDocs td = reader.termDocs(new Term("article_key",from)); |
| 475 | + if(td.next()){ |
| 476 | + byte[] serialized = reader.document(td.doc(),contextOnly).getBinaryValue("context"); |
| 477 | + if(serialized == null) |
| 478 | + return null; |
| 479 | + StringMap map = new StringMap(serialized); |
| 480 | + try { |
| 481 | + //ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(serialized)); |
| 482 | + //HashMap<String, ArrayList<String>> map; |
| 483 | + //map = (HashMap<String, ArrayList<String>>) in.readObject(); |
| 484 | + // cache it ! |
| 485 | + //cache.put(new Element(cacheKey,map)); |
| 490 | + cache.put(cacheKey,map); |
| 491 | + return map.get(to); |
| 492 | + /* } catch (ClassNotFoundException e) { |
| 493 | + log.error("For getContext("+from+","+to+") got class not found exception: "+e.getMessage()); |
| 494 | + e.printStackTrace(); // shouldn't happen! */ |
| 495 | + } catch(Exception e){ |
| 496 | + e.printStackTrace(); |
504 | 497 | } |
| 498 | + |
505 | 499 | } |
| 500 | + |
| 501 | + return null; |
506 | 502 | } |
507 | 503 | |
508 | | - /** Get all article titles linking to given title (from inlinks cache) |
509 | | - * @throws IOException */ |
510 | | - public Collection<String> getInLinksFromCache(String key) throws IOException{ |
511 | | - ensureRead(); |
512 | | - TermDocs td = reader.termDocs(new Term("inlinks_key",key)); |
513 | | - while(td.next()){ |
514 | | - return new StringList(reader.document(td.doc()).get("inlinks")).toCollection(); |
 | 504 | + /** Get all contexts in which article <i>to</i> is linked from <i>from</i>.
 | 505 | + * Returns null if there is no context, or the link is invalid. */
| 507 | + @SuppressWarnings("unchecked") |
| 508 | + public Collection<String> getContextOld(String from, String to) throws IOException { |
| 509 | + ensureRead(); |
| 510 | + |
| 511 | + TermDocs td = reader.termDocs(new Term("context_key",to+"|"+from)); |
| 512 | + if(td.next()){ |
| 513 | + return new StringList(reader.document(td.doc()).get("context")).toCollection(); |
515 | 514 | } |
516 | | - return new ArrayList<String>(); |
| 515 | + |
| 516 | + return null; |
517 | 517 | } |
518 | | - |
| 518 | + |
| 519 | + /** Get a dictionary of all article keys (ns:title) in this index */ |
| 520 | + public Dictionary getKeys() throws IOException{ |
| 521 | + ensureRead(); |
| 522 | + return new LuceneDictionary(reader,"article_key"); |
| 523 | + } |
| 524 | + |
519 | 525 | public Integer getDocId(String key) throws IOException { |
520 | 526 | TermDocs td = reader.termDocs(new Term("article_key",key)); |
521 | 527 | if(td.next()){ |
— | — | @@ -530,7 +536,18 @@ |
531 | 537 | if(reader != null) |
532 | 538 | reader.close(); |
533 | 539 | if(directory != null) |
534 | | - directory.close(); |
535 | | - |
| 540 | + directory.close(); |
536 | 541 | } |
| 542 | + |
| 543 | + public ObjectCache getCache() { |
| 544 | + return cache; |
| 545 | + } |
| 546 | + |
| 547 | + /*public ObjectCache getRefCache() { |
| 548 | + return refCache; |
| 549 | + } */ |
| 550 | + |
537 | 554 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/ContextParser.java |
— | — | @@ -0,0 +1,272 @@ |
| 2 | +package org.wikimedia.lsearch.ranks; |
| 3 | + |
| 4 | +import java.util.ArrayList; |
| 5 | +import java.util.HashSet; |
| 9 | + |
| 10 | +/** |
| 11 | + * Parse wiki-text into sentences. Each sentence will provide a |
| 12 | + * context for links within it. |
| 13 | + * |
| 14 | + * @author rainman |
| 15 | + * |
| 16 | + */ |
| 17 | +public class ContextParser { |
| 18 | + protected char[] text; |
| 19 | + protected int len; |
| 20 | + protected HashSet<String> imageLocalized = null; |
| 21 | + protected HashSet<String> categoryLocalized = null; |
| 22 | + protected HashSet<String> interwiki = null; |
| 23 | + |
| 24 | + protected ArrayList<Context> contexts = null; |
| 25 | + protected int conIn = 0; |
| 26 | + |
| 27 | + public static class Context { |
| 28 | + int start; |
| 29 | + int end; |
| 30 | + String context = null; |
| 31 | + public Context(int start, int end) { |
| 32 | + this.start = start; |
| 33 | + this.end = end; |
| 34 | + } |
| 35 | + |
| 36 | + public String get(String text){ |
| 37 | + if(context == null) |
| 38 | + context = text.substring(start,end); |
| 39 | + return context; |
| 40 | + } |
| 41 | + |
| 42 | + } |
| 43 | + |
| 44 | + public ContextParser(String text, HashSet<String> imageLocalized, HashSet<String> categoryLocalized, HashSet<String> interwiki){ |
| 45 | + this.text = text.toCharArray(); |
| 46 | + this.len = this.text.length; |
| 47 | + this.imageLocalized = imageLocalized; |
| 48 | + this.categoryLocalized = categoryLocalized; |
| 49 | + this.interwiki = interwiki; |
| 50 | + parse(); |
| 51 | + } |
| 52 | + |
| 53 | + /** Get indexes of boundaries of contexts (usually different sentences) */ |
| 54 | + public ArrayList<Context> getContexts(){ |
| 55 | + return contexts; |
| 56 | + } |
| 57 | + |
 | 58 | + /** Get the context at a given text index; must be called with incrementally larger indexes */
| 59 | + public Context getNext(int index){ |
| 60 | + if(conIn >= contexts.size()) |
| 61 | + return null; |
| 62 | + Context c = contexts.get(conIn); |
| 63 | + if(c.start > index) |
| 64 | + return null; |
| 65 | + else{ |
| 66 | + for(;conIn<contexts.size();conIn++){ |
| 67 | + c = contexts.get(conIn); |
| 68 | + if(c.start <= index && index < c.end) |
| 69 | + return c; |
| 70 | + if(c.start > index) |
| 71 | + return null; // no context for this index |
| 72 | + } |
| 73 | + } |
| 74 | + return null; |
| 75 | + } |
| 76 | + |
| 77 | + /** fetch up to 128 chars of prefix */ |
| 78 | + protected String fetchPrefix(int in){ |
| 79 | + int count = 0; |
| 80 | + for(int i=in;i<len;i++,count++){ |
| 81 | + if(count >= 128) |
| 82 | + return null; |
| 83 | + if(text[i] == ':'){ |
| 84 | + return new String(text,in,count); |
| 85 | + } |
| 86 | + } |
| 87 | + return null; |
| 88 | + } |
| 89 | + |
| 90 | + protected void parse(){ |
| 91 | + if(contexts != null) |
| 92 | + return; |
| 93 | + contexts = new ArrayList<Context>(); |
| 94 | + int cur = 0; |
| 95 | + char c; |
| 96 | + boolean seenLetter = false; |
| 97 | + int topLinkLevel = 0; |
| 98 | + boolean inQuotes = false; |
| 99 | + int start = 0; |
| 100 | + for(;cur<len;cur++){ |
| 101 | + c = text[cur]; |
| 102 | + if(!seenLetter && Character.isLetterOrDigit(c)) |
| 103 | + seenLetter = true; |
| 104 | + switch(c){ |
| 105 | + case '[': |
| 106 | + if(cur+2>=len) |
| 107 | + continue; // EOF |
| 108 | + if(text[cur+1]=='['){ |
| 109 | + boolean valid = false; |
| 110 | + int closingInx = -1; |
| 111 | + // seek to see if this is valid link opening |
| 112 | + for(int i=cur+2;i<len && i<cur+512;i++){ |
| 113 | + if(text[i]=='[' && i+1<len && text[i+1]=='[') |
| 114 | + break; // bad internal link |
| 115 | + if(text[i]==']' && i+1<len && text[i+1]==']'){ |
| 116 | + topLinkLevel++; // ok, valid internal link |
| 117 | + closingInx = i+2; |
| 118 | + valid = true; |
| 119 | + break; |
| 120 | + } |
| 121 | + |
| 122 | + } |
| 123 | + // begin of links |
| 124 | + String prefix = fetchPrefix(cur+2); |
| 125 | + if(prefix != null && isImage(prefix)){ |
| 126 | + // take full image caption as one context |
| 127 | + int lastPipe = cur + 2 + prefix.length(); |
| 128 | + int linkLevel = 0; |
| 129 | + int imageEnd = -1; |
| 130 | + for(int i=lastPipe;i<len;i++){ |
| 131 | + if(text[i]=='|') |
| 132 | + lastPipe = i; |
| 133 | + // internal link begin |
| 134 | + if(text[i]=='[' && i+1<len && text[i+1]=='[') |
| 135 | + linkLevel++; |
| 136 | + // internal link end |
| 137 | + if(text[i]==']' && i+1<len && text[i+1]==']'){ |
| 138 | + if(linkLevel == 0){ |
| 139 | + imageEnd = i+1; |
| 140 | + break; |
| 141 | + } else if(linkLevel != 0) |
| 142 | + linkLevel--; |
| 143 | + } |
| 144 | + } |
 | 145 | + // take the image caption (the text after the last pipe) as its own context
 | 146 | + if(imageEnd != -1){
 | 147 | + contexts.add(new Context(lastPipe+1,imageEnd-1)); // end is exclusive: stop before "]]"
| 149 | + start = imageEnd+1; |
| 150 | + cur = imageEnd; |
| 151 | + } |
| 152 | + } else if(valid && prefix != null && (isCategory(prefix) || isInterwiki(prefix))){ |
| 153 | + // skip categories |
| 154 | + if(seenLetter) |
| 155 | + contexts.add(new Context(start,cur)); |
| 156 | + start = cur; |
| 157 | + cur = closingInx; |
| 158 | + } |
| 159 | + } |
| 160 | + break; |
| 161 | + case 'h': case 'f': |
| 162 | + // check simple http/ftp links |
| 163 | + if(checkPrefix(cur,"http://") || checkPrefix(cur,"ftp://")){ |
| 164 | + if(seenLetter && cur-start>2) |
| 165 | + contexts.add(new Context(start,cur-1)); |
| 166 | + for(;cur<len;cur++){ |
| 167 | + if(text[cur]==' ' || text[cur]==']'){ // seek to after link |
| 168 | + start = cur+1; |
| 169 | + seenLetter = false; |
| 170 | + break; |
| 171 | + } |
| 172 | + } |
| 173 | + } |
| 174 | + break; |
| 175 | + case '<': |
| 176 | + if(checkPrefix(cur,"<tr>") || checkPrefix(cur,"</tr>")){ |
| 177 | + if(seenLetter) |
| 178 | + contexts.add(new Context(start,cur-1)); |
| 179 | + start = cur + 4; |
| 180 | + } |
| 181 | + break; |
| 182 | + case ']': |
| 183 | + if(cur+2>=len) |
| 184 | + continue; // EOF |
| 185 | + if(text[cur+1]==']' && topLinkLevel!=0){ |
| 186 | + topLinkLevel--; |
| 187 | + } |
| 188 | + break; |
| 189 | + case '"': |
| 190 | + // numbers like 6'5" |
| 191 | + if(cur>0 && Character.isDigit(text[cur-1])) |
| 192 | + break; |
| 193 | + inQuotes = !inQuotes; |
| 194 | + break; |
| 195 | + case '=': |
| 196 | + case '!': |
| 197 | + case '?': |
| 198 | + case '{': |
| 199 | + case '}': |
| 200 | + case '*': |
| 201 | + case '#': |
| 202 | + case '|': |
| 203 | + case '.': |
| 204 | + case '\n': |
| 205 | + // whole quote and link text is context |
| 206 | + //if(inQuotes || topLinkLevel!=0) |
| 207 | + // break; |
| 208 | + // only double == is separator (as in headings) |
| 209 | + if(c == '=' && !(cur+1<len && text[cur+1]=='=')) |
| 210 | + break; |
| 211 | + // | is separator in tables, etc.. but not in link syntax like [[x|y]] |
| 212 | + if(c == '|' && topLinkLevel != 0 && (cur+1<len && text[cur+1]!='-')) |
| 213 | + break; |
| 214 | + // dot/comma between numbers |
| 215 | + if((c == '.' || c==',') && (cur>0 && Character.isDigit(text[cur-1]) && cur+1<len && Character.isDigit(text[cur+1]))) |
| 216 | + break; |
 | 217 | + // a single newline is not a boundary; only a paragraph break (\n\n) or an indented line (\n:) is
| 218 | + if(c == '\n' && !(cur+1<len && (text[cur+1]=='\n' || text[cur+1]==':'))) |
| 219 | + break; |
| 220 | + |
| 221 | + if(seenLetter){ |
| 222 | + contexts.add(new Context(start,cur)); |
| 223 | + start = cur + 1; |
| 224 | + seenLetter = false; |
| 225 | + } |
| 226 | + break; |
| 227 | + } |
| 228 | + } |
| 229 | + if(seenLetter) |
| 230 | + contexts.add(new Context(start,len)); |
| 231 | + } |
| 232 | + |
| 233 | + /** check text from cur position */ |
| 234 | + private boolean checkPrefix(int cur, String prefix) { |
| 235 | + if(cur + prefix.length() < len){ |
| 236 | + for(int i=0;i<prefix.length();i++){ |
| 237 | + if(text[cur+i] != prefix.charAt(i)) |
| 238 | + return false; |
| 239 | + } |
| 240 | + return true; |
| 241 | + } |
| 242 | + return false; |
| 243 | + } |
| 244 | + |
| 245 | + /** Check if this is an "image" keyword using localization */ |
| 246 | + private final boolean isImage(String prefix){ |
| 247 | + prefix = prefix.toLowerCase(); |
| 248 | + if(prefix.equals("image")) |
| 249 | + return true; |
| 250 | + if(imageLocalized!=null && imageLocalized.contains(prefix)) |
| 251 | + return true; |
| 252 | + return false; |
| 253 | + } |
| 254 | + |
| 255 | + private final boolean isCategory(String prefix){ |
| 256 | + prefix = prefix.toLowerCase(); |
| 257 | + if(prefix.equals("category")) |
| 258 | + return true; |
| 259 | + if(categoryLocalized!=null && categoryLocalized.contains(prefix)) |
| 260 | + return true; |
| 261 | + return false; |
| 262 | + } |
| 263 | + |
| 264 | + private final boolean isInterwiki(String prefix){ |
| 265 | + if(interwiki!=null) |
| 266 | + return interwiki.contains(prefix); |
| 267 | + else |
| 268 | + return false; |
| 269 | + } |
 | 270 | +
| 273 | +} |
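A usage sketch with hypothetical wikitext, mirroring how Links.addArticleInfo drives the parser; the three localized-name sets come from Localization:

    String text = "Adams attended [[Brentwood School (England)|Brentwood School]]. "
                + "He met [[Griff Rhys Jones]] there.";
    ContextParser cp = new ContextParser(text, imageLocalized, categoryLocalized, interwiki);
    Matcher m = Pattern.compile("\\[\\[(.*?)(\\|(.*?))?\\]\\]").matcher(text);
    while (m.find()) {
        ContextParser.Context c = cp.getNext(m.start(1)); // indexes must be increasing
        if (c != null)
            System.out.println(m.group(1) + " -> " + c.get(text));
        // e.g. "Griff Rhys Jones -> He met [[Griff Rhys Jones]] there"
    }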
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/RankBuilder.java |
— | — | @@ -56,7 +56,6 @@ |
57 | 57 | public static void main(String[] args) throws IOException { |
58 | 58 | String inputfile = null; |
59 | 59 | String dbname = null; |
60 | | - boolean useExistingTemp = false; |
61 | 60 | |
62 | 61 | System.out.println("MediaWiki Lucene search indexer - build rank info from xml dumps.\n"); |
63 | 62 | |
— | — | @@ -64,15 +63,11 @@ |
65 | 64 | log = Logger.getLogger(RankBuilder.class); |
66 | 65 | |
67 | 66 | if(args.length < 2){ |
68 | | - System.out.println("Syntax: java RankBuilder [-t] <inputfile> <dbname>"); |
69 | | - System.out.println("Options:"); |
70 | | - System.out.println(" -t - use existing temporary ranking index"); |
| 67 | + System.out.println("Syntax: java RankBuilder <inputfile> <dbname>"); |
71 | 68 | return; |
72 | 69 | } |
73 | 70 | for(int i=0;i<args.length;i++){ |
74 | | - if(args[i].equals("-t")) |
75 | | - useExistingTemp = true; |
76 | | - else if(inputfile == null) |
| 71 | + if(inputfile == null) |
77 | 72 | inputfile = args[i]; |
78 | 73 | else if(dbname == null) |
79 | 74 | dbname = args[i]; |
— | — | @@ -92,59 +87,22 @@ |
93 | 88 | long start = System.currentTimeMillis(); |
94 | 89 | |
95 | 90 | // link info |
96 | | - Links links = null; |
97 | | - if(useExistingTemp) |
98 | | - links = Links.openExisting(iid); |
99 | | - else |
100 | | - links = processLinks(inputfile,getTitles(inputfile,langCode,iid),langCode); |
101 | | - //links.cacheInLinks(); |
102 | | - /*log.info("Creating ref count cache"); |
103 | | - HashMap<Integer,Integer> refCache = new HashMap<Integer,Integer>(); |
104 | | - HashMap<Integer,String> keyCache = new HashMap<Integer,String>(); |
105 | | - HashMap<String,Integer> docIdCache = new HashMap<String,Integer>(); |
106 | | - Word w; Dictionary d = links.getKeys(); |
107 | | - while((w = d.next()) != null){ |
108 | | - String key = w.getWord(); |
109 | | - int docid = links.getDocId(key); |
110 | | - refCache.put(docid,links.getNumInLinks(key)); |
111 | | - keyCache.put(docid,key); |
112 | | - docIdCache.put(key,docid); |
| 91 | + Links links = Links.createNew(iid); |
| 92 | + try{ |
| 93 | + processLinks(inputfile,links,iid,langCode); |
| 94 | + } catch(IOException e){ |
| 95 | + log.fatal("I/O error processing "+inputfile+" : "+e.getMessage()); |
 | 96 | + e.printStackTrace();
 | 97 | + return; // don't snapshot a partially built links index
113 | 97 | } |
114 | | - log.info("Caching in/out links"); |
115 | | - HashMap<Integer,int[]> outLinkCache = new HashMap<Integer,int[]>(); |
116 | | - HashMap<Integer,int[]> inLinkCache = new HashMap<Integer,int[]>(); |
117 | | - // cache in/out links |
118 | | - d = links.getKeys(); |
119 | | - while((w = d.next()) != null){ |
120 | | - String key = w.getWord(); |
121 | | - int docid = docIdCache.get(key); |
122 | | - Collection<String> in = links.getInLinks(key,keyCache); |
123 | | - int[] inset = new int[in.size()]; |
124 | | - int i=0; |
125 | | - for(String k : in) |
126 | | - inset[i++] = docIdCache.get(k); |
127 | | - inLinkCache.put(docid,inset); |
128 | | - |
129 | | - Collection<String> out = links.getOutLinks(key).toCollection(); |
130 | | - int[] outset = new int[out.size()]; |
131 | | - i = 0; |
132 | | - for(String k : out){ |
133 | | - outset[i++] = docIdCache.get(k); |
134 | | - } |
135 | | - outLinkCache.put(docid,outset); |
136 | | - } |
137 | | - storeLinkAnalysis(links,iid,docIdCache,keyCache,refCache,inLinkCache,outLinkCache); */ |
138 | | - storeLinkAnalysis(links,iid); |
139 | | - //Storage store = Storage.getInstance(); |
140 | | - //store.storePageReferences(links.getAll(),dbname); |
141 | | - //storeRelated(store,links,dbname); |
142 | | - |
| 98 | + |
| 99 | + IndexThread.makeIndexSnapshot(iid.getLinks(),iid.getLinks().getImportPath()); |
| 100 | + |
143 | 101 | long end = System.currentTimeMillis(); |
144 | 102 | |
145 | 103 | System.out.println("Finished generating ranks in "+formatTime(end-start)); |
146 | 104 | } |
147 | 105 | |
148 | | - //public static void storeLinkAnalysis(Links links, IndexId iid, HashMap<String, Integer> docIdCache, HashMap<Integer, String> keyCache, HashMap<Integer, Integer> refCache, HashMap<Integer, int[]> inLinkCache, HashMap<Integer, int[]> outLinkCache) throws IOException{ |
| 106 | + @Deprecated |
149 | 107 | public static void storeLinkAnalysis(Links links, IndexId iid) throws IOException{ |
150 | 108 | log.info("Storing link analysis data"); |
151 | 109 | LinkAnalysisStorage store = new LinkAnalysisStorage(iid); |
— | — | @@ -154,7 +112,7 @@ |
155 | 113 | String key = w.getWord(); |
156 | 114 | int ref = links.getNumInLinks(key); |
157 | 115 | String redirectTarget = links.getRedirectTarget(key); |
158 | | - ArrayList<String> anchor = links.getAnchors(key); |
| 116 | + ArrayList<String> anchor = null; //links.getAnchors(key); |
159 | 117 | ArrayList<Related> related = new ArrayList<Related>(); //FIXME: too slow getRelated(key,links,refCount,keyCache); |
160 | 118 | //ArrayList<Related> related = getRelated(key,links,docIdCache,keyCache,refCache,inLinkCache,outLinkCache); |
161 | 119 | ArrayList<String> redirect = links.getRedirectsTo(key); |
— | — | @@ -164,53 +122,16 @@ |
165 | 123 | |
166 | 124 | } |
167 | 125 | |
168 | | - public static Links processLinks(String inputfile, Links links, String langCode) { |
169 | | - log.info("Second pass, calculating article links..."); |
170 | | - InputStream input = null; |
171 | | - // second pass - calculate page ranks |
172 | | - try { |
173 | | - input = Tools.openInputFile(inputfile); |
174 | | - } catch (IOException e) { |
175 | | - log.fatal("I/O error opening "+inputfile+" : "+e.getMessage()); |
176 | | - return null; |
177 | | - } |
| 126 | + public static Links processLinks(String inputfile, Links links, IndexId iid, String langCode) throws IOException { |
| 127 | + log.info("Calculating article links..."); |
| 128 | + InputStream input = Tools.openInputFile(inputfile); |
178 | 129 | // calculate ranks |
179 | | - LinkReader rr = new LinkReader(links,langCode); |
| 130 | + LinkReader rr = new LinkReader(links,iid,langCode); |
180 | 131 | XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(rr, 5000)); |
181 | | - try { |
182 | | - reader.readDump(); |
183 | | - links.flush(); |
184 | | - } catch (IOException e) { |
185 | | - log.fatal("I/O error reading dump while calculating ranks for from "+inputfile+" : "+e.getMessage()); |
186 | | - return null; |
187 | | - } |
| 132 | + reader.readDump(); |
| 133 | + links.flush(); |
188 | 134 | return links; |
189 | 135 | } |
190 | | - |
191 | | - public static Links getTitles(String inputfile,String langCode,IndexId iid) { |
192 | | - log.info("First pass, getting a list of valid articles..."); |
193 | | - InputStream input = null; |
194 | | - try { |
195 | | - input = Tools.openInputFile(inputfile); |
196 | | - } catch (IOException e) { |
197 | | - log.fatal("I/O error opening "+inputfile+" : "+e.getMessage()); |
198 | | - return null; |
199 | | - } |
200 | | - try { |
201 | | - // first pass, get titles |
202 | | - TitleReader tr = new TitleReader(langCode,iid); |
203 | | - XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(tr, 5000)); |
204 | | - reader.readDump(); |
205 | | - input.close(); |
206 | | - Links links = tr.getLinks(); |
207 | | - links.flush(); |
208 | | - return links; |
209 | | - } catch (IOException e) { |
210 | | - log.fatal("I/O error reading dump while getting titles from "+inputfile+" : "+e.getMessage()); |
211 | | - return null; |
212 | | - } |
213 | | - |
214 | | - } |
215 | 136 | |
216 | 137 | /** |
217 | 138 | * Get related articles, sorted descending by score |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/StringMap.java |
— | — | @@ -0,0 +1,198 @@ |
| 2 | +package org.wikimedia.lsearch.ranks; |
| 3 | + |
| 4 | +import java.io.ByteArrayInputStream; |
| 5 | +import java.io.ByteArrayOutputStream; |
| 6 | +import java.io.DataInputStream; |
| 7 | +import java.io.DataOutputStream; |
| 8 | +import java.io.EOFException; |
| 9 | +import java.io.IOException; |
| 10 | +import java.io.UnsupportedEncodingException; |
| 12 | +import java.util.ArrayList; |
| 13 | +import java.util.Collections; |
| 14 | +import java.util.Comparator; |
| 15 | +import java.util.HashMap; |
| 16 | +import java.util.HashSet; |
| 17 | +import java.util.Map.Entry; |
| 18 | + |
| 19 | +public class StringMap { |
| 20 | + protected static final int BUFFER_SIZE = 300; |
| 21 | + protected char[] buf = new char[BUFFER_SIZE]; |
| 22 | + protected int len = 0, pos = 0; |
| 23 | + protected HashMap<String,ArrayList<String>> map = null; |
| 24 | + protected HashMap<Integer,ArrayList<Integer>> hashMap = null; |
| 25 | + protected byte[] serialized = null; |
| 26 | + public static final char DELIMITER = '\0'; |
| 27 | + protected final int INT_SIZE = Integer.SIZE / 8; |
| 28 | + |
| 29 | + public StringMap(HashMap<String,ArrayList<String>> map){ |
| 30 | + this.map = map; |
| 31 | + } |
| 32 | + |
| 33 | + public StringMap(byte[] serialized) throws IOException{ |
| 34 | + this.serialized = serialized; |
| 35 | + readHash(); |
| 36 | + } |
| 37 | + |
 | 38 | + /** Initialize the small hash table at the beginning of the stream */
| 39 | + private void readHash() throws IOException { |
| 40 | + hashMap = new HashMap<Integer,ArrayList<Integer>>(); |
| 41 | + ByteArrayInputStream ba = new ByteArrayInputStream(serialized); |
| 42 | + DataInputStream di = new DataInputStream(ba); |
| 43 | + int size = di.readInt(); |
| 44 | + for(int i=0;i<size;i++){ |
| 45 | + int hash = di.readInt(); |
| 46 | + ArrayList<Integer> pos = hashMap.get(hash); |
| 47 | + if(pos == null){ |
| 48 | + pos = new ArrayList<Integer>(); |
| 49 | + hashMap.put(hash,pos); |
| 50 | + } |
| 51 | + pos.add(di.readInt()); |
| 52 | + } |
| 53 | + } |
| 54 | + |
| 55 | + protected int encLen(String str) throws UnsupportedEncodingException{ |
| 56 | + return str.getBytes("utf-8").length; |
| 57 | + } |
| 58 | + |
| 59 | + public byte[] serialize() throws IOException{ |
| 60 | + if(serialized != null) |
| 61 | + return serialized; |
| 62 | + // unique string, string -> index (within string segment) |
| 63 | + HashMap<String,Integer> strings = new HashMap<String,Integer>(); |
| 64 | + // hash -> list of keys |
| 65 | + HashMap<Integer,ArrayList<String>> hashs = new HashMap<Integer,ArrayList<String>>(); |
| 66 | + // contexts, key -> index of string (from strings) |
| 67 | + HashMap<String,ArrayList<Integer>> contexts = new HashMap<String,ArrayList<Integer>>(); |
| 68 | + // keys in some order |
| 69 | + ArrayList<String> keys = new ArrayList<String>(); |
| 70 | + keys.addAll(map.keySet()); |
| 71 | + int offset = 0; |
| 72 | + for(String key : keys){ |
| 73 | + // mapping hash -> keys |
| 74 | + int hash = key.hashCode(); |
| 75 | + ArrayList<String> hk = hashs.get(hash); |
| 76 | + if(hk == null){ |
| 77 | + hk = new ArrayList<String>(); |
| 78 | + hashs.put(hash,hk); |
| 79 | + } |
| 80 | + hk.add(key); |
| 81 | + // contexts |
| 82 | + ArrayList<Integer> cc = new ArrayList<Integer>(); |
| 83 | + contexts.put(key,cc); |
| 84 | + for(String s : map.get(key)){ |
| 85 | + // identifier |
| 86 | + Integer i = strings.get(s); |
| 87 | + if(i == null){ |
| 88 | + i = offset; |
| 89 | + strings.put(s,i); |
| 90 | + offset += encLen(s) + INT_SIZE; |
| 91 | + } |
| 92 | + cc.add(i); |
| 93 | + } |
| 94 | + } |
| 95 | + int keyOffset = INT_SIZE+2*INT_SIZE*map.size(); |
| 96 | + int stringOffset = keyOffset; |
| 97 | + // key -> offset |
| 98 | + HashMap<String,Integer> keyOffsets = new HashMap<String,Integer>(); |
| 99 | + for(String key : keys){ |
| 100 | + keyOffsets.put(key,stringOffset); |
| 101 | + stringOffset += INT_SIZE+encLen(key)+INT_SIZE+contexts.get(key).size()*INT_SIZE; |
| 102 | + } |
| 103 | + // serialize! |
| 104 | + ByteArrayOutputStream ba = new ByteArrayOutputStream(); |
| 105 | + DataOutputStream ds = new DataOutputStream(ba); |
 | 106 | + ds.writeInt(map.size()); // one (hash, offset) pair per key; count keys, not distinct hashes
| 107 | + // write out the hashmap |
| 108 | + ArrayList<Entry<Integer,ArrayList<String>>> sortedHash = new ArrayList<Entry<Integer,ArrayList<String>>>(); |
| 109 | + sortedHash.addAll(hashs.entrySet()); |
| 110 | + Collections.sort(sortedHash,new Comparator<Entry<Integer,ArrayList<String>>>(){ |
| 111 | + public int compare(Entry<Integer, ArrayList<String>> o1, Entry<Integer, ArrayList<String>> o2) { |
| 112 | + return o1.getKey() - o2.getKey(); |
| 113 | + } |
| 114 | + }); |
 | 115 | + // write pairs: [ hash ] [ offset of key record ]
| 116 | + for(Entry<Integer,ArrayList<String>> e : sortedHash){ |
| 117 | + int hash = e.getKey(); |
| 118 | + for(String key : e.getValue()){ |
| 119 | + ds.writeInt(hash); |
| 120 | + ds.writeInt(keyOffsets.get(key)); |
| 121 | + } |
| 122 | + } |
 | 123 | + // write: [ key.length ] [ key ] [ context count ] [context1_pos] [context2_pos] ...
| 124 | + for(String key : keys){ |
| 125 | + byte[] b = key.getBytes("utf-8"); |
| 126 | + ds.writeInt(b.length); |
| 127 | + ds.write(b); |
| 128 | + ArrayList<Integer> con = contexts.get(key); |
| 129 | + if(con == null || con.size()==0) |
| 130 | + ds.writeInt(0); |
| 131 | + else{ |
| 132 | + ds.writeInt(con.size()); |
| 133 | + for(Integer index : con){ |
| 134 | + ds.writeInt(stringOffset+index); |
| 135 | + } |
| 136 | + } |
| 137 | + } |
| 138 | +	// write each unique string as [ size ] [ string ] |
| 139 | + HashSet<String> written = new HashSet<String>(); |
| 140 | + for(String key : keys){ |
| 141 | + for(String c : map.get(key)){ |
| 142 | + if(written.contains(c)) |
| 143 | + continue; |
| 144 | + byte[] b = c.getBytes("utf-8"); |
| 145 | + ds.writeInt(b.length); |
| 146 | + ds.write(b); |
| 147 | + written.add(c); |
| 148 | + } |
| 149 | + } |
| 150 | + serialized = ba.toByteArray(); |
| 151 | + return serialized; |
| 152 | + } |
| 153 | + |
| 154 | + private final int read(){ |
| 155 | + return serialized[pos++] & 0xff; |
| 156 | + } |
| 157 | + |
| 158 | + protected int readInt() throws IOException { |
| 159 | + int ch1 = read(); |
| 160 | + int ch2 = read(); |
| 161 | + int ch3 = read(); |
| 162 | + int ch4 = read(); |
| 163 | + if ((ch1 | ch2 | ch3 | ch4) < 0) |
| 164 | + throw new EOFException(); |
| 165 | + return ((ch1 << 24) + (ch2 << 16) + (ch3 << 8) + (ch4 << 0)); |
| 166 | + } |
| 167 | + |
| 168 | + protected String readString() throws IOException{ |
| 169 | + int len = readInt(); |
| 170 | + int start = pos; |
| 171 | + pos+=len; |
| 172 | + return new String(serialized,start,len,"utf-8"); |
| 173 | + } |
| 174 | + |
| 175 | +	/** Get the list of context strings stored for a key. |
| 176 | +	 * @throws IOException */ |
| 177 | + public ArrayList<String> get(String key) throws IOException{ |
| 178 | + ArrayList<String> ret = new ArrayList<String>(); |
| 179 | + if(!hashMap.containsKey(key.hashCode())) |
| 180 | + return ret; |
| 181 | + for(Integer p : hashMap.get(key.hashCode())){ |
| 182 | + pos = p; |
| 183 | + String k = readString(); |
| 184 | + if(key.equals(k)){ |
| 185 | + // found key, read context |
| 186 | + int num = readInt(); |
| 187 | + int[] strings = new int[num]; |
| 188 | + for(int i=0;i<num;i++){ |
| 189 | + strings[i] = readInt(); |
| 190 | + } |
| 191 | + for(int strpos : strings){ |
| 192 | + pos = strpos; |
| 193 | + ret.add(readString()); |
| 194 | + } |
| 195 | + } |
| 196 | + } |
| 197 | + return ret; |
| 198 | + } |
| 199 | +} |
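The serialize()/get() pair above fixes a compact binary layout: a count of (hash, key offset) pairs, the pair table sorted by hash, then one record per key of [ key.length ] [ key ] [ num contexts ] [ context offsets... ], and finally each unique context string as [ size ] [ string ]. A minimal round-trip sketch follows; the class name StringMap, the add() mutator, and the byte[] constructor are hypothetical stand-ins, since the class declaration appears earlier in the diff:

    // hypothetical names: StringMap, add(), and the deserializing constructor
    StringMap m = new StringMap();
    m.add("0:Douglas Adams", "english writer and humorist"); // key -> context string
    byte[] image = m.serialize();             // build the binary image once, cached thereafter
    StringMap onDisk = new StringMap(image);  // read side works off the same bytes
    ArrayList<String> ctx = onDisk.get("0:Douglas Adams"); // ["english writer and humorist"]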
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/ObjectCache.java |
— | — | @@ -0,0 +1,67 @@ |
| 2 | +package org.wikimedia.lsearch.ranks; |
| 3 | + |
| 4 | +import java.util.HashMap; |
| 5 | +import java.util.LinkedList; |
| 6 | +import java.util.WeakHashMap; |
| 7 | + |
| 8 | +/** |
| 9 | + * Maintain a cache of objects. The cache is a simple FIFO cache of |
| 10 | + * constant size: when full, the oldest entry is evicted for the newest. |
| 11 | + * |
| 12 | + * @author rainman |
| 13 | + * |
| 14 | + */ |
| 15 | +public class ObjectCache { |
| 16 | + /** used to maintain FIFO cache of valid keys */ |
| 17 | + protected String[] fifo; |
| 18 | + /** storage of objects */ |
| 19 | + protected HashMap<String,Object> objs = new HashMap<String,Object>(); |
| 20 | + protected int size, inx; |
| 21 | + |
| 22 | + protected long hits = 0; |
| 23 | + protected long miss = 0; |
| 24 | + |
| 25 | + protected int report = 0; |
| 26 | + |
| 27 | + public ObjectCache(int size){ |
| 28 | + this.size = size; |
| 29 | + this.fifo = new String[size]; |
| 30 | + this.inx = 0; |
| 31 | + } |
| 32 | + |
| 33 | + public void put(String key, Object obj){ |
| 34 | + // add to FIFO queue only if not already in it |
| 35 | + if(!objs.containsKey(key)){ |
| 36 | + if(inx >= size) |
| 37 | + inx = 0; |
| 38 | + String del = fifo[inx]; |
| 39 | + if(del != null){ |
| 40 | +			// remove the oldest entry from the cache |
| 41 | + objs.remove(del); |
| 42 | + } |
| 43 | + fifo[inx] = key; // latest cached key |
| 44 | + inx++; |
| 45 | + } |
| 46 | + objs.put(key,obj); |
| 47 | + } |
| 48 | + |
| 49 | + public Object get(String key){ |
| 50 | + if(++report >= 5000){ |
| 51 | + report = 0; |
| 52 | + System.out.println(getStats()); |
| 53 | + } |
| 54 | + Object obj = objs.get(key); |
| 55 | +		if(obj != null) |
| 56 | + hits++; |
| 57 | + else |
| 58 | + miss++; |
| 59 | + return obj; |
| 60 | + } |
| 61 | + |
| 62 | + public String getStats(){ |
| 63 | + long total = hits+miss; |
| 64 | + return "HITS: "+hits+" ("+((float)hits*100/total)+"%), MISS: "+miss+" ("+((float)miss*100/total)+"%)"; |
| 65 | + } |
| 66 | + |
| 67 | + |
| 68 | +} |
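A short usage sketch for ObjectCache as committed above; the capacity and keys are illustrative only:

    ObjectCache cache = new ObjectCache(2); // room for two entries
    cache.put("a", 1);
    cache.put("b", 2);
    cache.put("c", 3);                      // evicts "a", the oldest key
    cache.get("a");                         // null -> counted as a miss
    cache.get("b");                         // Integer 2 -> counted as a hit
    System.out.println(cache.getStats());   // HITS: 1 (50.0%), MISS: 1 (50.0%)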
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/EnglishAnalyzer.java |
— | — | @@ -30,6 +30,7 @@ |
31 | 31 | import org.apache.lucene.analysis.Analyzer; |
32 | 32 | import org.apache.lucene.analysis.TokenStream; |
33 | 33 | import org.wikimedia.lsearch.analyzers.WikiTokenizer; |
| 34 | +import org.wikimedia.lsearch.config.IndexId; |
34 | 35 | |
35 | 36 | /** |
36 | 37 | * @author Kate Turner |
— | — | @@ -58,6 +59,6 @@ |
59 | 60 | if(streams.get(fieldName) != null) |
60 | 61 | return streams.get(fieldName); |
61 | 62 | |
62 | | - return new AliasPorterStemFilter(new WikiTokenizer(text,"en",false)); |
| 63 | + return new AliasPorterStemFilter(new WikiTokenizer(text,IndexId.get("enwiki"),false)); |
63 | 64 | } |
64 | 65 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java |
— | — | @@ -18,6 +18,7 @@ |
19 | 19 | import org.wikimedia.lsearch.analyzers.WikiQueryParser.NamespacePolicy; |
20 | 20 | import org.wikimedia.lsearch.config.Configuration; |
21 | 21 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
| 22 | +import org.wikimedia.lsearch.config.IndexId; |
22 | 23 | import org.wikimedia.lsearch.index.WikiIndexModifier; |
23 | 24 | import org.wikimedia.lsearch.search.NamespaceFilter; |
24 | 25 | |
— | — | @@ -40,7 +41,7 @@ |
41 | 42 | WikiQueryParser.KEYWORD_BOOST = 0.05f; |
42 | 43 | WikiQueryParser.ADD_TITLE_PHRASES = false; |
43 | 44 | WikiIndexModifier.ALT_TITLES = 3; |
44 | | - FieldBuilder.BuilderSet bs = new FieldBuilder("").getBuilder(); |
| 45 | + FieldBuilder.BuilderSet bs = new FieldBuilder(IndexId.get("enwiki")).getBuilder(); |
45 | 46 | FieldNameFactory ff = new FieldNameFactory(); |
46 | 47 | try{ |
47 | 48 | WikiQueryParser parser = new WikiQueryParser(bs.getFields().contents(),new SimpleAnalyzer(),bs,null); |
— | — | @@ -126,9 +127,10 @@ |
127 | 128 | // extraction of phrases |
128 | 129 | ArrayList<String> stopWords = new ArrayList<String>(); |
129 | 130 | stopWords.add("the"); stopWords.add("who"); |
130 | | - stopWords.add("is"); stopWords.add("a"); |
131 | | - Analyzer analyzer = Analyzers.getSearcherAnalyzer("en"); |
132 | | - bs = new FieldBuilder("en").getBuilder(); |
| 131 | + stopWords.add("is"); stopWords.add("a"); |
| 132 | + IndexId enwiki = IndexId.get("enwiki"); |
| 133 | + Analyzer analyzer = Analyzers.getSearcherAnalyzer(enwiki); |
| 134 | + bs = new FieldBuilder(enwiki).getBuilder(); |
133 | 135 | parser = new WikiQueryParser(bs.getFields().title(),"0",analyzer,bs,NamespacePolicy.IGNORE,stopWords); |
134 | 136 | assertEquals("[how, do, you, do]",parser.extractWords(parser.parseRaw("how do you do")).toString()); |
135 | 137 | assertEquals("[making, something, rest]",parser.extractWords(parser.parseRaw("(help:making something incategory:blah) OR (rest incategory:crest)")).toString()); |
— | — | @@ -230,8 +232,8 @@ |
231 | 233 | // ================================== |
232 | 234 | // Tests with actual params :) |
233 | 235 | // ================================== |
234 | | - analyzer = Analyzers.getSearcherAnalyzer("en"); |
235 | | - bs = new FieldBuilder("en").getBuilder(); |
| 236 | + analyzer = Analyzers.getSearcherAnalyzer(enwiki); |
| 237 | + bs = new FieldBuilder(enwiki).getBuilder(); |
236 | 238 | parser = new WikiQueryParser(bs.getFields().contents(),"0",analyzer,bs,NamespacePolicy.LEAVE); |
237 | 239 | WikiQueryParser.ADD_STEM_TITLE = false; |
238 | 240 | WikiQueryParser.STEM_TITLE_BOOST = 0; |
— | — | @@ -354,8 +356,8 @@ |
355 | 357 | assertEquals("(+(contents:something contents:someth^0.5) +contents:for +(contents:godel contents:goedel)) (+title:something^2.0 +title:for^2.0 +(title:godel^2.0 title:goedel^2.0)) ((+alttitle1:something^6.0 +alttitle1:for^6.0 +(alttitle1:godel^6.0 alttitle1:goedel^6.0)) (+alttitle2:something^6.0 +alttitle2:for^6.0 +(alttitle2:godel^6.0 alttitle2:goedel^6.0)) (+alttitle3:something^6.0 +alttitle3:for^6.0 +(alttitle3:godel^6.0 alttitle3:goedel^6.0)))",q.toString()); |
356 | 358 | |
357 | 359 | // Backward compatiblity for complex filters |
358 | | - analyzer = Analyzers.getSearcherAnalyzer("en"); |
359 | | - bs = new FieldBuilder("en").getBuilder(); |
| 360 | + analyzer = Analyzers.getSearcherAnalyzer(enwiki); |
| 361 | + bs = new FieldBuilder(enwiki).getBuilder(); |
360 | 362 | parser = new WikiQueryParser(bs.getFields().contents(),"0,1,4,12",analyzer,bs,NamespacePolicy.IGNORE); |
361 | 363 | |
362 | 364 | q = parser.parseTwoPass("beans everyone",NamespacePolicy.REWRITE); |
— | — | @@ -381,15 +383,15 @@ |
382 | 384 | assertEquals("[(many,1,5), (more,7,11), (has,16,19), (some,23,27), (g,29,30)]",t.toString()); |
383 | 385 | |
384 | 386 | // German |
385 | | - analyzer = Analyzers.getSearcherAnalyzer("de"); |
386 | | - bs = new FieldBuilder("de").getBuilder(); |
| 387 | + analyzer = Analyzers.getSearcherAnalyzer(IndexId.get("dewiki")); |
| 388 | + bs = new FieldBuilder(IndexId.get("dewiki")).getBuilder(); |
387 | 389 | parser = new WikiQueryParser(bs.getFields().contents(),"0",analyzer,bs,NamespacePolicy.IGNORE); |
388 | 390 | q = parser.parseTwoPass("welche rolle spielen Mineralstoffe in der Ernährung?",NamespacePolicy.IGNORE); |
389 | 391 | assertEquals("(+(contents:welche contents:welch^0.5) +(contents:rolle contents:roll^0.5) +(contents:spielen contents:spiel^0.5) +(contents:mineralstoffe contents:mineralstoff^0.5) +contents:in +contents:der +(+(contents:ernahrung contents:ernahr^0.5) (contents:ernaehrung contents:ernaehr^0.5))) (+title:welche^2.0 +title:rolle^2.0 +title:spielen^2.0 +title:mineralstoffe^2.0 +title:in^2.0 +title:der^2.0 +(title:ernahrung^2.0 title:ernaehrung^2.0))",q.toString()); |
390 | 392 | |
391 | 393 | // CJK |
392 | | - analyzer = Analyzers.getSearcherAnalyzer("ja"); |
393 | | - bs = new FieldBuilder("ja").getBuilder(); |
| 394 | + analyzer = Analyzers.getSearcherAnalyzer(IndexId.get("jawiki")); |
| 395 | + bs = new FieldBuilder(IndexId.get("jawiki")).getBuilder(); |
394 | 396 | parser = new WikiQueryParser(bs.getFields().contents(),"0",analyzer,bs,NamespacePolicy.IGNORE); |
395 | 397 | q = parser.parseFourPass("うろパン",NamespacePolicy.IGNORE,false); |
396 | 398 | assertEquals("contents:\"うろ ろハ ハン\" title:\"うろ ろハ ハン\"^2.0 (alttitle1:\"うろ ろハ ハン\"^6.0 alttitle2:\"うろ ろハ ハン\"^6.0 alttitle3:\"うろ ろハ ハン\"^6.0)",q.toString()); |
— | — | @@ -402,8 +404,8 @@ |
403 | 405 | |
404 | 406 | |
405 | 407 | // Malayalam |
406 | | - analyzer = Analyzers.getSearcherAnalyzer("ml"); |
407 | | - bs = new FieldBuilder("ml").getBuilder(); |
| 408 | + analyzer = Analyzers.getSearcherAnalyzer(IndexId.get("mlwiki")); |
| 409 | + bs = new FieldBuilder(IndexId.get("mlwiki")).getBuilder(); |
408 | 410 | parser = new WikiQueryParser(bs.getFields().contents(),"0",analyzer,bs,NamespacePolicy.IGNORE); |
409 | 411 | q = parser.parseFourPass("കൊറിയ ",NamespacePolicy.IGNORE,false); |
410 | 412 | assertEquals("contents:കറയ title:കറയ^2.0 (alttitle1:കറയ^6.0 alttitle2:കറയ^6.0 alttitle3:കറയ^6.0)",q.toString()); |
— | — | @@ -420,8 +422,8 @@ |
421 | 423 | WikiQueryParser.STEM_TITLE_BOOST = 1; |
422 | 424 | |
423 | 425 | // Localization tests |
424 | | - analyzer = Analyzers.getSearcherAnalyzer("sr"); |
425 | | - bs = new FieldBuilder("sr").getBuilder(); |
| 426 | + analyzer = Analyzers.getSearcherAnalyzer(IndexId.get("srwiki")); |
| 427 | + bs = new FieldBuilder(IndexId.get("srwiki")).getBuilder(); |
426 | 428 | parser = new WikiQueryParser(bs.getFields().contents(),"0",analyzer,bs,NamespacePolicy.LEAVE); |
427 | 429 | |
428 | 430 | q = parser.parseTwoPass("all:добродошли на википедију",NamespacePolicy.IGNORE); |
— | — | @@ -430,8 +432,8 @@ |
431 | 433 | q = parser.parseTwoPass("all:dobrodošli na šđčćž",NamespacePolicy.IGNORE); |
432 | 434 | assertEquals("(+contents:dobrodosli +contents:na +contents:sdjccz) (+title:dobrodosli^3.0 +title:na^3.0 +title:sdjccz^3.0)",q.toString()); |
433 | 435 | |
434 | | - analyzer = Analyzers.getSearcherAnalyzer("th"); |
435 | | - bs = new FieldBuilder("th").getBuilder(); |
| 436 | + analyzer = Analyzers.getSearcherAnalyzer(IndexId.get("thwiki")); |
| 437 | + bs = new FieldBuilder(IndexId.get("thwiki")).getBuilder(); |
436 | 438 | parser = new WikiQueryParser(bs.getFields().contents(),"0",analyzer,bs,NamespacePolicy.LEAVE); |
437 | 439 | |
438 | 440 | q = parser.parseTwoPass("ภาษาไทย",NamespacePolicy.IGNORE); |
— | — | @@ -441,8 +443,8 @@ |
442 | 444 | assertEquals("(+namespace:12 +(+contents:ภาษา +contents:ไทย)) (+namespace:12 +(+title:ภาษา^3.0 +title:ไทย^3.0))",q.toString()); |
443 | 445 | |
444 | 446 | // vietnamese |
445 | | - analyzer = Analyzers.getSearcherAnalyzer("vi"); |
446 | | - bs = new FieldBuilder("vi").getBuilder(); |
| 447 | + analyzer = Analyzers.getSearcherAnalyzer(IndexId.get("viwiki")); |
| 448 | + bs = new FieldBuilder(IndexId.get("viwiki")).getBuilder(); |
447 | 449 | parser = new WikiQueryParser(bs.getFields().contents(),"0",analyzer,bs,NamespacePolicy.LEAVE); |
448 | 450 | |
449 | 451 | q = parser.parseTwoPass("Gánh nước đêm trăng",NamespacePolicy.IGNORE); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java |
— | — | @@ -11,11 +11,12 @@ |
12 | 12 | import org.apache.lucene.analysis.Token; |
13 | 13 | import org.apache.lucene.analysis.TokenStream; |
14 | 14 | import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine; |
| 15 | +import org.wikimedia.lsearch.config.IndexId; |
15 | 16 | import org.wikimedia.lsearch.index.WikiIndexModifier; |
16 | 17 | |
17 | 18 | public class FastWikiTokenizerTest { |
18 | 19 | public static void displayTokensForParser(String text) { |
19 | | - FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,"en",false); |
| 20 | + FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,IndexId.get("enwiki"),false); |
20 | 21 | Token[] tokens = parser.parse().toArray(new Token[] {}); |
21 | 22 | for (int i = 0; i < tokens.length; i++) { |
22 | 23 | Token token = tokens[i]; |
— | — | @@ -116,7 +117,7 @@ |
117 | 118 | for(int i=0;i<2000;i++){ |
118 | 119 | for(TestArticle article : articles){ |
119 | 120 | String text = article.content; |
120 | | - FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,"en",false); |
| 121 | + FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,IndexId.get("enwiki"),false); |
121 | 122 | parser.parse(); |
122 | 123 | } |
123 | 124 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SpellCheckTest.java |
— | — | @@ -94,6 +94,8 @@ |
95 | 95 | {"cource", "course"}, |
96 | 96 | {"carolene products",""}, |
97 | 97 | {"orvileWright","overnight"}, |
| 98 | + {"livia tremor control","olivia tremor control"}, |
| 99 | + {"ommmited","omitted"}, |
98 | 100 | |
99 | 101 | }; |
100 | 102 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/SearchDaemon.java |
— | — | @@ -86,6 +86,10 @@ |
87 | 87 | sendOutputLine("#no suggestion"); |
88 | 88 | for(ResultSet rs : res.getResults()){ |
89 | 89 | sendResultLine(rs.score, rs.namespace, rs.title); |
| 90 | + if(rs.getContext() != null){ |
| 91 | + for(String c : rs.getContext()) |
| 92 | + sendOutputLine("#context "+c); |
| 93 | + } |
90 | 94 | if(rs.getExplanation() != null) |
91 | 95 | sendOutputLine(rs.getExplanation().toString()); |
92 | 96 | } |
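With this change each result line may be followed by one #context line per context string, alongside the existing #no suggestion and explanation lines. Assuming sendResultLine() emits score, namespace and title space-separated, as its arguments suggest, a response fragment might look like this (values made up):

    1.2345678 0 Douglas Adams
    #context Author of ''The Hitchhiker's Guide to the Galaxy''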
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/HttpHandler.java |
— | — | @@ -41,7 +41,7 @@ |
42 | 42 | protected String postData; |
43 | 43 | |
44 | 44 | protected final int BUF_SIZE = 8192; |
45 | | - protected final char[] outputBuffer = new char[BUF_SIZE]; |
| 45 | + protected char[] outputBuffer = new char[BUF_SIZE]; |
46 | 46 | protected int bufLength = 0; |
47 | 47 | |
48 | 48 | protected int minorVersion; // the x in HTTP 1.x |
— | — | @@ -227,9 +227,12 @@ |
228 | 228 | log.debug(">>>"+sout); |
229 | 229 | // write to buffer instead directly to stream! |
230 | 230 | char[] s = (sout+"\r\n").toCharArray(); |
231 | | - if(bufLength + s.length >= BUF_SIZE) |
| 231 | + if(bufLength + s.length >= outputBuffer.length) |
232 | 232 | flushOutput(); |
233 | | - // FIXME: what if array is 2x larger than buffer? |
| 233 | +		// extend buffer if a single line exceeds its capacity (the flush above emptied it) |
| 234 | +		if(s.length > outputBuffer.length){ |
| 235 | +			outputBuffer = new char[s.length*2]; |
| 236 | +		} |
234 | 237 | System.arraycopy(s,0,outputBuffer,bufLength,s.length); |
235 | 238 | bufLength+=s.length; |
236 | 239 | } |
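The corrected check compares an incoming line against the buffer's capacity rather than its fill level: comparing against bufLength would reallocate (and silently drop buffered output) whenever a line merely happened to be longer than what was already queued. A condensed sketch of the intended invariant, with flushOutput() assumed to write the buffer out and reset bufLength to 0:

    if(bufLength + s.length >= outputBuffer.length)
        flushOutput();                           // buffer now empty
    if(s.length > outputBuffer.length)           // single oversized line
        outputBuffer = new char[s.length*2];     // safe: nothing is buffered
    System.arraycopy(s, 0, outputBuffer, bufLength, s.length);
    bufLength += s.length;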
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestTest.java |
— | — | @@ -46,7 +46,7 @@ |
47 | 47 | Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid,false); |
48 | 48 | NamespaceFilter nsDefault = new NamespaceFilter("0"); // default to main namespace |
49 | 49 | FieldBuilder.Case dCase = FieldBuilder.Case.IGNORE_CASE; |
50 | | - FieldBuilder.BuilderSet bs = new FieldBuilder(global.getLanguage(iid.getDBname()),dCase).getBuilder(dCase); |
| 50 | + FieldBuilder.BuilderSet bs = new FieldBuilder(iid,dCase).getBuilder(dCase); |
51 | 51 | WikiQueryParser parser = new WikiQueryParser(bs.getFields().contents(),nsDefault,analyzer,bs,WikiQueryParser.NamespacePolicy.IGNORE,null); |
52 | 52 | while(true){ |
53 | 53 | System.out.print(">> "); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexWriter.java |
— | — | @@ -42,7 +42,7 @@ |
43 | 43 | public CleanIndexWriter(IndexId iid) throws IOException{ |
44 | 44 | GlobalConfiguration global = GlobalConfiguration.getInstance(); |
45 | 45 | this.iid = iid; |
46 | | - this.builder = new FieldBuilder("",FieldBuilder.Case.IGNORE_CASE,FieldBuilder.Stemmer.NO_STEMMER,FieldBuilder.Options.SPELL_CHECK); |
| 46 | + this.builder = new FieldBuilder(iid,FieldBuilder.Case.IGNORE_CASE,FieldBuilder.Stemmer.NO_STEMMER,FieldBuilder.Options.SPELL_CHECK); |
47 | 47 | this.langCode = global.getLanguage(iid.getDBname()); |
48 | 48 | HashSet<String> stopWords = new HashSet<String>(); |
49 | 49 | for(String w : StopWords.getStopWords(iid,langCode)) |
— | — | @@ -90,7 +90,7 @@ |
91 | 91 | if(!WikiIndexModifier.checkAddPreconditions(a,langCode)) |
92 | 92 | return; // don't add if preconditions are not met |
93 | 93 | |
94 | | - Object[] ret = WikiIndexModifier.makeDocumentAndAnalyzer(a,builder,iid); |
| 94 | + Object[] ret = WikiIndexModifier.makeDocumentAndAnalyzer(a,builder,iid,null); |
95 | 95 | Document doc = (Document) ret[0]; |
96 | 96 | Analyzer analyzer = (Analyzer) ret[1]; |
97 | 97 | try { |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/SpellCheckIndexer.java |
— | — | @@ -176,13 +176,13 @@ |
177 | 177 | } |
178 | 178 | |
179 | 179 | /** |
180 | | - * Register a title in the index, without tokenization, just lowercase. |
| 180 | +	 * Register a title in the index without tokenization, stripping accents and the like. |
181 | 181 | * |
182 | 182 | * @param title |
183 | 183 | */ |
184 | 184 | public void addTitle(String title){ |
185 | 185 | Document doc = new Document(); |
186 | | - doc.add(new Field("title", title.toLowerCase(), Field.Store.NO, Field.Index.UN_TOKENIZED)); |
| 186 | + doc.add(new Field("title", FastWikiTokenizerEngine.stipTitle(title.toLowerCase()), Field.Store.NO, Field.Index.UN_TOKENIZED)); |
187 | 187 | ngramWriter.addDocument(doc); |
188 | 188 | } |
189 | 189 | /** |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/related/CompactRelated.java |
— | — | @@ -36,7 +36,7 @@ |
37 | 37 | this.title = title; |
38 | 38 | } |
39 | 39 | public String serialize(){ |
40 | | - return score+" "+relates; |
| 40 | + return ((float)score)+" "+relates; |
41 | 41 | } |
42 | 42 | |
43 | 43 | public static ArrayList<String> convertToStringList(Collection<CompactRelated> rel){ |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/related/RelatedBuilder.java |
— | — | @@ -18,11 +18,14 @@ |
19 | 19 | import org.mediawiki.dumper.ProgressFilter; |
20 | 20 | import org.mediawiki.dumper.Tools; |
21 | 21 | import org.mediawiki.importer.XmlDumpReader; |
| 22 | +import org.wikimedia.lsearch.beans.Title; |
22 | 23 | import org.wikimedia.lsearch.config.Configuration; |
23 | 24 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
24 | 25 | import org.wikimedia.lsearch.config.IndexId; |
| 26 | +import org.wikimedia.lsearch.config.IndexRegistry; |
25 | 27 | import org.wikimedia.lsearch.index.IndexThread; |
26 | 28 | import org.wikimedia.lsearch.ranks.Links; |
| 29 | +import org.wikimedia.lsearch.search.NamespaceFilter; |
27 | 30 | import org.wikimedia.lsearch.spell.api.Dictionary; |
28 | 31 | import org.wikimedia.lsearch.spell.api.Dictionary.Word; |
29 | 32 | import org.wikimedia.lsearch.storage.ArticleAnalytics; |
— | — | @@ -46,7 +49,7 @@ |
47 | 50 | System.out.println("MediaWiki Lucene search indexer - build a map of related articles.\n"); |
48 | 51 | |
49 | 52 | Configuration.open(); |
50 | | - if(args.length > 2 && args.length < 1){ |
| 53 | + if(args.length > 2 || args.length < 1){ |
51 | 54 | System.out.println("Syntax: java RelatedBuilder <dbname> [<dump file>]"); |
52 | 55 | return; |
53 | 56 | } |
— | — | @@ -64,7 +67,7 @@ |
65 | 68 | if(dumpfile != null) |
66 | 69 | rebuildFromDump(dumpfile,iid); |
67 | 70 | else |
68 | | - rebuildFromTemp(iid); |
| 71 | + rebuildFromLinks(iid); |
69 | 72 | } catch (IOException e) { |
70 | 73 | log.fatal("Rebuild I/O error: "+e.getMessage()); |
71 | 74 | e.printStackTrace(); |
— | — | @@ -83,7 +86,8 @@ |
84 | 87 | // first pass - titles |
85 | 88 | InputStream input = null; |
86 | 89 | input = Tools.openInputFile(inputfile); |
87 | | - TitleReader tr = new TitleReader(langCode); |
| 90 | + NamespaceFilter nsf = GlobalConfiguration.getInstance().getDefaultNamespace(iid); |
| 91 | + TitleReader tr = new TitleReader(iid,langCode,nsf); |
88 | 92 | XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(tr, 5000)); |
89 | 93 | reader.readDump(); |
90 | 94 | input.close(); |
— | — | @@ -104,32 +108,42 @@ |
105 | 109 | * Rebuild related articles index for iid |
106 | 110 | * @throws IOException |
107 | 111 | */ |
108 | | - public static void rebuildFromTemp(IndexId iid) throws IOException { |
| 112 | + public static void rebuildFromLinks(IndexId iid) throws IOException { |
109 | 113 | CompactLinks links = new CompactLinks(); |
110 | | - Links temp = Links.openExisting(iid); |
| 114 | + Links temp = Links.openForRead(iid,iid.getLinks().getImportPath()); |
111 | 115 | |
112 | | - log.info("Reading all titles"); |
| 116 | + NamespaceFilter nsf = GlobalConfiguration.getInstance().getDefaultNamespace(iid); |
| 117 | +		log.info("Reading titles in the default search namespaces"); |
113 | 118 | Dictionary dict = temp.getKeys(); |
114 | 119 | Word w; |
115 | 120 | HashMap<Integer,CompactArticleLinks> keyCache = new HashMap<Integer,CompactArticleLinks>(); |
116 | 121 | while((w = dict.next()) != null){ |
117 | 122 | String key = w.getWord(); |
118 | | - links.add(key,temp.getNumInLinks(key)); |
119 | | - keyCache.put(temp.getDocId(key),links.get(key)); |
| 123 | + int ns = Integer.parseInt(key.substring(0,key.indexOf(':'))); |
| 124 | + if(nsf.contains(ns)){ |
| 125 | + links.add(key,temp.getNumInLinks(key)); |
| 126 | + keyCache.put(temp.getDocId(key),links.get(key)); |
| 127 | + } |
120 | 128 | } |
121 | 129 | |
122 | 130 | log.info("Reading in/out links"); |
123 | 131 | dict = temp.getKeys(); |
124 | 132 | while((w = dict.next()) != null){ |
125 | 133 | String key = w.getWord(); |
126 | | - CompactArticleLinks l = links.get(key); |
127 | | - // inlinks |
128 | | - l.setInLinks(temp.getInLinks(l,keyCache)); |
129 | | - // outlinks |
130 | | - ArrayList<CompactArticleLinks> out = new ArrayList<CompactArticleLinks>(); |
131 | | - for(String k : temp.getOutLinks(key).toCollection()) |
132 | | - out.add(links.get(k)); |
133 | | - l.setOutLinks(out); |
| 134 | + int ns = Integer.parseInt(key.substring(0,key.indexOf(':'))); |
| 135 | + if(nsf.contains(ns)){ |
| 136 | + CompactArticleLinks l = links.get(key); |
| 137 | + // inlinks |
| 138 | + l.setInLinks(temp.getInLinks(l,keyCache)); |
| 139 | + // outlinks |
| 140 | + ArrayList<CompactArticleLinks> out = new ArrayList<CompactArticleLinks>(); |
| 141 | + for(String k : temp.getOutLinks(key).toCollection()){ |
| 142 | + CompactArticleLinks cs = links.get(k); |
| 143 | + if(cs != null) |
| 144 | + out.add(cs); |
| 145 | + } |
| 146 | + l.setOutLinks(out); |
| 147 | + } |
134 | 148 | } |
135 | 149 | temp.close(); |
136 | 150 | temp = null; // GC |
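The keys produced by Links are of the form <ns>:<title>, so the namespace check above is a plain prefix parse; for example (illustrative key):

    String key = "0:Douglas Adams";
    int ns = Integer.parseInt(key.substring(0, key.indexOf(':'))); // ns == 0
    // the entry is kept only if the default search namespace filter contains 0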
— | — | @@ -144,14 +158,19 @@ |
145 | 159 | RelatedStorage store = new RelatedStorage(iid); |
146 | 160 | int num = 0; |
147 | 161 | int total = links.getAll().size(); |
148 | | - for(CompactArticleLinks cs : links.getAll()){ |
| 162 | + NamespaceFilter nsf = GlobalConfiguration.getInstance().getDefaultNamespace(iid); |
| 163 | + for(CompactArticleLinks cs : links.getAll()){ |
149 | 164 | num++; |
150 | 165 | if(num % 1000 == 0) |
151 | | - log.info("Storing ["+num+"/"+total+"]"); |
152 | | - ArrayList<CompactRelated> rel = getRelated(cs,links); |
153 | | - if(rel.size() == 0) |
154 | | - continue; |
155 | | - store.addRelated(cs.toString(),rel); |
| 166 | + log.info("Storing ["+num+"/"+total+"]"); |
| 167 | + Title t = new Title(cs.getKey()); |
| 168 | + // do analysis only for default search namespace (usually main namespace) |
| 169 | + if(nsf.contains(t.getNamespace())){ |
| 170 | + ArrayList<CompactRelated> rel = getRelated(cs,links); |
| 171 | + if(rel.size() == 0) |
| 172 | + continue; |
| 173 | + store.addRelated(cs.toString(),rel); |
| 174 | + } |
156 | 175 | } |
157 | 176 | store.snapshot(); |
158 | 177 | } |
— | — | @@ -161,15 +180,19 @@ |
162 | 181 | */ |
163 | 182 | public static ArrayList<CompactRelated> getRelated(CompactArticleLinks cs, CompactLinks links){ |
164 | 183 | ArrayList<CompactRelated> ret = new ArrayList<CompactRelated>(); |
165 | | - |
166 | | - HashSet<CompactArticleLinks> ll = new HashSet<CompactArticleLinks>(); |
| 184 | + |
| 185 | + HashSet<CompactArticleLinks> ll = new HashSet<CompactArticleLinks>(); |
| 186 | +		double maxnorm = 0; // maximal attainable raw related score, used for scaling |
167 | 187 | if(cs.linksIn != null){ |
168 | | - for(CompactArticleLinks csl : cs.linksIn) |
| 188 | + for(CompactArticleLinks csl : cs.linksIn){ |
169 | 189 | ll.add(csl); |
| 190 | + maxnorm += 1.0/norm(csl.links); |
| 191 | + } |
170 | 192 | } |
171 | 193 | for(CompactArticleLinks from : ll){ |
172 | 194 | if(from != cs){ |
173 | | - double score = relatedScore(cs,ll,from); |
| 195 | + double rscore = relatedScore(cs,ll,from); |
| 196 | + double score = (rscore / maxnorm) * rscore; |
174 | 197 | if(score != 0) |
175 | 198 | ret.add(new CompactRelated(cs,from,score)); |
176 | 199 | } |
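maxnorm is the largest raw score a related page could reach if every inlink contributed its full 1/norm share, so the final score (rscore/maxnorm)*rscore squashes values quadratically into [0, rscore]. A small worked example, under the assumption that norm() is the identity:

    // cs has two inlinking pages, with 2 and 4 links respectively:
    // maxnorm = 1/2 + 1/4 = 0.75
    // if relatedScore() returns 0.5 for one of them:
    // score = (0.5 / 0.75) * 0.5 = 0.333...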
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/related/LinkReader.java |
— | — | @@ -80,34 +80,14 @@ |
81 | 81 | rank = links.get(key); |
82 | 82 | if(rank != null) |
83 | 83 | return rank; |
84 | | - // try lowercase |
85 | | - key = ns+":"+title.toLowerCase(); |
86 | | - rank = links.get(key); |
87 | | - if(rank != null) |
88 | | - return rank; |
89 | 84 | // try lowercase with first letter upper case |
90 | 85 | if(title.length()==1) |
91 | 86 | key = ns+":"+title.toUpperCase(); |
92 | 87 | else |
93 | | - key = ns+":"+title.substring(0,1).toUpperCase()+title.substring(1).toLowerCase(); |
| 88 | + key = ns+":"+title.substring(0,1).toUpperCase()+title.substring(1); |
94 | 89 | rank = links.get(key); |
95 | 90 | if(rank != null) |
96 | 91 | return rank; |
97 | | - // try title case |
98 | | - key = ns+":"+WordUtils.capitalize(title); |
99 | | - rank = links.get(key); |
100 | | - if(rank != null) |
101 | | - return rank; |
102 | | - // try upper case |
103 | | - key = ns+":"+title.toUpperCase(); |
104 | | - rank = links.get(key); |
105 | | - if(rank != null) |
106 | | - return rank; |
107 | | - // try capitalizing at word breaks |
108 | | - key = ns+":"+WordUtils.capitalize(title,new char[] {' ','-','(',')','}','{','.',',','?','!'}); |
109 | | - rank = links.get(key); |
110 | | - if(rank != null) |
111 | | - return rank; |
112 | 92 | |
113 | 93 | return null; |
114 | 94 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/related/RelatedTitle.java |
— | — | @@ -1,10 +1,13 @@ |
2 | 2 | package org.wikimedia.lsearch.related; |
3 | 3 | |
| 4 | +import java.util.ArrayList; |
| 5 | + |
4 | 6 | import org.wikimedia.lsearch.beans.Title; |
5 | 7 | |
6 | 8 | public class RelatedTitle { |
7 | 9 | protected Title related; |
8 | 10 | protected double score; |
| 11 | + protected ArrayList<String> contexts = null; |
9 | 12 | |
10 | 13 | public RelatedTitle(Title related, double score) { |
11 | 14 | this.related = related; |
— | — | @@ -22,6 +25,12 @@ |
23 | 26 | public void setScore(double score) { |
24 | 27 | this.score = score; |
25 | 28 | } |
| 29 | + public ArrayList<String> getContexts() { |
| 30 | + return contexts; |
| 31 | + } |
| 32 | + public void setContexts(ArrayList<String> contexts) { |
| 33 | + this.contexts = contexts; |
| 34 | + } |
26 | 35 | @Override |
27 | 36 | public String toString() { |
28 | 37 | return related.toString()+" ("+score+")"; |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/related/TitleReader.java |
— | — | @@ -12,6 +12,8 @@ |
13 | 13 | import org.mediawiki.importer.Revision; |
14 | 14 | import org.mediawiki.importer.Siteinfo; |
15 | 15 | import org.wikimedia.lsearch.beans.ArticleLinks; |
| 16 | +import org.wikimedia.lsearch.config.IndexId; |
| 17 | +import org.wikimedia.lsearch.search.NamespaceFilter; |
16 | 18 | import org.wikimedia.lsearch.util.Localization; |
17 | 19 | |
18 | 20 | /** |
— | — | @@ -25,9 +27,13 @@ |
26 | 28 | Revision revision; |
27 | 29 | CompactLinks links = new CompactLinks(); |
28 | 30 | protected String langCode; |
| 31 | + protected IndexId iid; |
| 32 | + protected NamespaceFilter nsf; |
29 | 33 | |
30 | | - public TitleReader(String langCode){ |
| 34 | + public TitleReader(IndexId iid, String langCode, NamespaceFilter nsf){ |
31 | 35 | this.langCode = langCode; |
| 36 | + this.iid = iid; |
| 37 | + this.nsf = nsf; |
32 | 38 | } |
33 | 39 | |
34 | 40 | public void writeRevision(Revision revision) throws IOException { |
— | — | @@ -37,8 +43,10 @@ |
38 | 44 | this.page = page; |
39 | 45 | } |
40 | 46 | public void writeEndPage() throws IOException { |
41 | | - String key = page.Title.Namespace+":"+page.Title.Text; |
42 | | - links.add(key,0); |
| 47 | + if(nsf.contains(page.Title.Namespace)){ |
| 48 | + String key = page.Title.Namespace+":"+page.Title.Text; |
| 49 | + links.add(key,0); |
| 50 | + } |
43 | 51 | } |
44 | 52 | public CompactLinks getTitles() { |
45 | 53 | return links; |
— | — | @@ -54,7 +62,7 @@ |
55 | 63 | Iterator it = info.Namespaces.orderedEntries(); |
56 | 64 | while(it.hasNext()){ |
57 | 65 | Entry<Integer,String> pair = (Entry<Integer,String>)it.next(); |
58 | | - Localization.addCustomMapping(pair.getValue(),pair.getKey(),langCode); |
| 66 | + Localization.addCustomMapping(pair.getValue(),pair.getKey(),iid.getDBname()); |
59 | 67 | } |
60 | 68 | } |
61 | 69 | public void writeStartWiki() throws IOException { |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerClient.java |
— | — | @@ -73,7 +73,7 @@ |
74 | 74 | log.debug("Calling remotely indexUpdate("+myhost+","+iid+") on "+host); |
75 | 75 | r.indexUpdated(myhost,iid.toString()); |
76 | 76 | } catch (Exception e) { |
77 | | - log.warn("Error invoking remote method notifyIndexUpdated() on host "+host); |
| 77 | + log.warn("Error invoking remote method notifyIndexUpdated() on host "+host+" : "+e.getMessage()); |
78 | 78 | continue; |
79 | 79 | } |
80 | 80 | } |
— | — | @@ -102,10 +102,10 @@ |
103 | 103 | log.debug("Got new RMI messenger for host "+host); |
104 | 104 | return r; |
105 | 105 | } catch (RemoteException e) { |
106 | | - log.warn("Cannot contact RMI registry for host "+host); |
| 106 | + log.warn("Cannot contact RMI registry for host "+host+" : "+e.getMessage()); |
107 | 107 | throw e; |
108 | 108 | } catch (NotBoundException e) { |
109 | | - log.warn("No RMIMessenger instance at host "+host); |
| 109 | + log.warn("No RMIMessenger instance at host "+host+" : "+e.getMessage()); |
110 | 110 | throw e; |
111 | 111 | } |
112 | 112 | } |
— | — | @@ -126,7 +126,7 @@ |
127 | 127 | return res; |
128 | 128 | } catch (Exception e) { |
129 | 129 | //e.printStackTrace(); |
130 | | - log.warn("Error invoking remote method getIndexTimestamp() on host "+host); |
| 130 | + log.warn("Error invoking remote method getIndexTimestamp() on host "+host+" : "+e.getMessage()); |
131 | 131 | } |
132 | 132 | return null; |
133 | 133 | } |
— | — | @@ -137,7 +137,7 @@ |
138 | 138 | log.debug("Calling enqueueUpdateRecords("+records.length+" records) on "+host); |
139 | 139 | r.enqueueUpdateRecords(records); |
140 | 140 | } catch (Exception e) { |
141 | | - log.warn("Error invoking remote method enqueueUpdateRecords() on host "+host); |
| 141 | + log.warn("Error invoking remote method enqueueUpdateRecords() on host "+host+" : "+e.getMessage()); |
142 | 142 | throw e; |
143 | 143 | } |
144 | 144 | } |
— | — | @@ -148,7 +148,7 @@ |
149 | 149 | log.debug("Calling enqueueFrontend("+records.length+" records) on "+host); |
150 | 150 | r.enqueueFrontend(records); |
151 | 151 | } catch (Exception e) { |
152 | | - log.warn("Error invoking remote method enqueueFrontend() on host "+host); |
| 152 | + log.warn("Error invoking remote method enqueueFrontend() on host "+host+" : "+e.getMessage()); |
153 | 153 | throw e; |
154 | 154 | } |
155 | 155 | } |
— | — | @@ -159,7 +159,7 @@ |
160 | 160 | log.debug("Calling reportBack("+cards.length+" records) on "+host); |
161 | 161 | r.reportBack(cards); |
162 | 162 | } catch (Exception e) { |
163 | | - log.warn("Error invoking remote method sendReports on host "+host); |
| 163 | + log.warn("Error invoking remote method sendReports on host "+host+" : "+e.getMessage()); |
164 | 164 | } |
165 | 165 | } |
166 | 166 | |
— | — | @@ -177,7 +177,7 @@ |
178 | 178 | cache.invalidateSearchable(iid,host); |
179 | 179 | SearchResults res = new SearchResults(); |
180 | 180 | res.retry(); |
181 | | - log.warn("Error invoking remote method searchPart on host "+host); |
| 181 | + log.warn("Error invoking remote method searchPart on host "+host+" : "+e.getMessage()); |
182 | 182 | return res; |
183 | 183 | } |
184 | 184 | } |
— | — | @@ -188,7 +188,7 @@ |
189 | 189 | log.debug("Calling requestFlushAndNotify("+dbname+" records) on "+host); |
190 | 190 | return r.requestFlushAndNotify(dbname); |
191 | 191 | } catch (Exception e) { |
192 | | - log.warn("Error invoking remote method requestFlushAndNotify on host "+host); |
| 192 | + log.warn("Error invoking remote method requestFlushAndNotify on host "+host+" : "+e.getMessage()); |
193 | 193 | return false; |
194 | 194 | } |
195 | 195 | } |
— | — | @@ -199,7 +199,7 @@ |
200 | 200 | log.debug("Calling isSuccessfulFlush("+dbname+" records) on "+host); |
201 | 201 | return r.isSuccessfulFlush(dbname); |
202 | 202 | } catch (Exception e) { |
203 | | - log.warn("Error invoking remote method isSuccessfulFlush on host "+host); |
| 203 | + log.warn("Error invoking remote method isSuccessfulFlush on host "+host+" : "+e.getMessage()); |
204 | 204 | throw new IOException("Remote error"); |
205 | 205 | } |
206 | 206 | } |
— | — | @@ -212,7 +212,7 @@ |
213 | 213 | log.debug(" \\-> got: "+size); |
214 | 214 | return size; |
215 | 215 | } catch (Exception e) { |
216 | | - log.warn("Error invoking remote method getIndexerQueueSize on host "+host); |
| 216 | + log.warn("Error invoking remote method getIndexerQueueSize on host "+host+" : "+e.getMessage()); |
217 | 217 | return -1; |
218 | 218 | } |
219 | 219 | } |