Index: trunk/extensions/wikidiff2/wikidiff2.cpp |
— | — | @@ -325,6 +325,13 @@ |
326 | 326 | } |
327 | 327 | |
328 | 328 | // Split a string into words |
| 329 | +// |
| 330 | +// TODO: I think the best way to do this would be to use ICU BreakIterator |
| 331 | +// instead of libthai + DIY. Basically you'd run BreakIterators from several |
| 332 | +// different locales (en, th, ja) and merge the results, i.e. if a break occurs |
| 333 | +// in any locale at a given position, split the string. I don't know if the |
| 334 | +// quality of the Thai dictionary in ICU matches the one in libthai; we would |
| 335 | +// have to check this somehow. |
329 | 336 | void Wikidiff2::explodeWords(const String & text, WordVector &words) |
330 | 337 | { |
331 | 338 | // Don't try to do a word-level diff on very long lines |
— | — | @@ -360,10 +367,12 @@ |
361 | 368 | tisText += (char)thaiChar; |
362 | 369 | charSizes += (char)(p - charStart); |
363 | 370 | |
364 | | - if (!isSpace(ch) && lastChar && isSpace(lastChar)) { |
| 371 | + if (isLetter(ch)) { |
| 372 | + if (lastChar && !isLetter(lastChar)) { |
| 373 | + breaks.insert(charIndex); |
| 374 | + } |
| 375 | + } else { |
365 | 376 | breaks.insert(charIndex); |
366 | | - } else if (isChineseJapanese(ch)) { |
367 | | - breaks.insert(charIndex); |
368 | 377 | } |
369 | 378 | charIndex++; |
370 | 379 | lastChar = ch; |
Index: trunk/extensions/wikidiff2/debian/changelog |
— | — | @@ -1,3 +1,9 @@ |
| 2 | +php5-wikidiff2 (1.1.1-1) lucid; urgency=low |
| 3 | + |
| 4 | + * Fixed bug 33331 (word breaking around punctuation) |
| 5 | + |
| 6 | + -- Tim Starling <tstarling@wikimedia.org> Fri, 23 Dec 2011 15:33:40 +1100 |
| 7 | + |
2 | 8 | php5-wikidiff2 (1.1.0-2) lucid; urgency=low |
3 | 9 | |
4 | 10 | * Include a config file so the extension loads |
Index: trunk/extensions/wikidiff2/tests/003.phpt |
— | — | @@ -28,7 +28,7 @@ |
29 | 29 | </tr> |
30 | 30 | <tr> |
31 | 31 | <td class="diff-marker">−</td> |
32 | | - <td class="diff-deletedline"><div><span class="diffchange diffchange-inline">!!FUZZY!!Rajaa</span></div></td> |
| 32 | + <td class="diff-deletedline"><div><span class="diffchange diffchange-inline">!!FUZZY!!</span>Rajaa</div></td> |
33 | 33 | <td class="diff-marker">+</td> |
34 | | - <td class="diff-addedline"><div><span class="diffchange diffchange-inline">Rajaa</span></div></td> |
| 34 | + <td class="diff-addedline"><div>Rajaa</div></td> |
35 | 35 | </tr> |
Index: trunk/extensions/wikidiff2/wikidiff2.h |
— | — | @@ -43,7 +43,7 @@ |
44 | 44 | void printWordDiffSide(WordDiff &worddiff, bool added); |
45 | 45 | void printTextWithDiv(const String & input); |
46 | 46 | void printText(const String & input); |
47 | | - inline bool isChineseJapanese(int ch); |
| 47 | + inline bool isLetter(int ch); |
48 | 48 | inline bool isSpace(int ch); |
49 | 49 | void debugPrintWordDiff(WordDiff & worddiff); |
50 | 50 | |
— | — | @@ -54,11 +54,23 @@ |
55 | 55 | void explodeLines(const String & text, StringVector &lines); |
56 | 56 | }; |
57 | 57 | |
58 | | -bool Wikidiff2::isChineseJapanese(int ch) |
| 58 | +bool Wikidiff2::isLetter(int ch) |
59 | 59 | { |
60 | | - if (ch >= 0x3000 && ch <= 0x9fff) return true; |
61 | | - if (ch >= 0x20000 && ch <= 0x2a000) return true; |
62 | | - return false; |
| 60 | + // ASCII digits, underscore and ASCII letters always count as word characters |
| 61 | + if ((ch >= '0' && ch <= '9') || |
| 62 | + (ch == '_') || |
| 63 | + (ch >= 'A' && ch <= 'Z') || |
| 64 | + (ch >= 'a' && ch <= 'z')) |
| 65 | + { |
| 66 | + return true; |
| 67 | + } |
| 68 | + // Every remaining code point below 0xc0 (punctuation, control characters, etc.) is a non-letter |
| 69 | + if (ch < 0xc0) return false; |
| 70 | + // Chinese, Japanese: reported as non-letters so that these ranges get split up character by character |
| 71 | + if (ch >= 0x3000 && ch <= 0x9fff) return false; |
| 72 | + if (ch >= 0x20000 && ch <= 0x2a000) return false; |
| 73 | + // Any other code point is assumed to belong to a language that separates words with spaces |
| 74 | + return true; |
63 | 75 | } |
64 | 76 | |
65 | 77 | bool Wikidiff2::isSpace(int ch) |