r65466 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r65465 | r65466 | r65467 >
Date:16:03, 23 April 2010
Author:daniel
Status:deferred
Tags:
Comment:
NameHashTrial
Modified paths:
  • /trunk/WikiWord/WikiWordBuilder/debug-tweaks.properties (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/NameMaps.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/store/builder/NameHashTrial.java (added) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/store/builder/NameHashTrial.java
@@ -0,0 +1,57 @@
 2+package de.brightbyte.wikiword.store.builder;
 3+
 4+import java.io.BufferedReader;
 5+import java.io.BufferedWriter;
 6+import java.io.FileInputStream;
 7+import java.io.FileOutputStream;
 8+import java.io.IOException;
 9+import java.io.InputStream;
 10+import java.io.InputStreamReader;
 11+import java.io.OutputStream;
 12+import java.io.OutputStreamWriter;
 13+import java.io.PrintWriter;
 14+import java.security.NoSuchAlgorithmException;
 15+
 16+import de.brightbyte.data.Functor;
 17+import de.brightbyte.util.PersistenceException;
 18+import de.brightbyte.wikiword.builder.NameMaps;
 19+
 20+public class NameHashTrial {
 21+ public static void main(String[] args) throws IOException, PersistenceException, NoSuchAlgorithmException, InterruptedException {
 22+ String params = args[0];
 23+ int limit = Integer.parseInt(args[1]);
 24+
 25+ Functor<?, String> hash = NameMaps.newHash(params, "en");
 26+
 27+ InputStream rawIn = args.length>2 && !args[2].equals("-") ? new FileInputStream(args[2]) : System.in;
 28+ OutputStream rawOut = args.length>3 && !args[3].equals("-") ? new FileOutputStream(args[3]) : System.out;
 29+
 30+ BufferedReader in = new BufferedReader(new InputStreamReader(rawIn, "UTF-8"));
 31+ PrintWriter out = new PrintWriter(new BufferedWriter(new OutputStreamWriter(rawOut, "UTF-8")));
 32+
 33+ long start = System.nanoTime();
 34+
 35+ System.out.println("Reading input...");
 36+ String s;
 37+ int c = 0;
 38+ while ((s = in.readLine()) != null) {
 39+ c++;
 40+ if (c>limit) break;
 41+
 42+ Object h = hash.apply(s);
 43+
 44+ out.println(h+"\t"+s);
 45+ if (rawOut==System.out) out.flush();
 46+
 47+ if (c % 10000 == 0) System.out.format(" at %d\n", c);
 48+ }
 49+
 50+ if (rawOut!=System.out) out.close();
 51+ else out.flush();
 52+
 53+ if (rawIn!=System.in) in.close();
 54+
 55+ long t = System.nanoTime() - start;
 56+ System.out.format("Processed %d entries in %01.3f sec\n", c, t/1000000000.0);
 57+ }
 58+}
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/NameMaps.java
@@ -1,6 +1,5 @@
22 package de.brightbyte.wikiword.builder;
33
4 -import java.io.File;
54 import java.io.IOException;
65 import java.io.UnsupportedEncodingException;
76 import java.net.URL;
@@ -8,12 +7,8 @@
98 import java.util.Arrays;
109 import java.util.HashMap;
1110 import java.util.HashSet;
12 -import java.util.Map;
1311 import java.util.Set;
1412
15 -import org.ardverk.collection.PatriciaTrie;
16 -import org.ardverk.collection.StringKeyAnalyzer;
17 -
1813 import de.brightbyte.data.BlockDigest;
1914 import de.brightbyte.data.ByteString;
2015 import de.brightbyte.data.Codec;
@@ -22,7 +17,6 @@
2318 import de.brightbyte.data.KeyValueStore;
2419 import de.brightbyte.data.LongIntLookup;
2520 import de.brightbyte.data.MapLookup;
26 -import de.brightbyte.data.XorFold32;
2721 import de.brightbyte.data.XorFold64;
2822 import de.brightbyte.data.XorWrap;
2923 import de.brightbyte.io.HuffmanDataCodec;
@@ -43,14 +37,62 @@
4438 return new HashMap<String, V>();
4539 }*/
4640
 41+ protected static Set<String> parseParams(String spec) {
 42+ String[] tt = spec.split("[,;|+/ &]+");
 43+
 44+ Set<String> params = new HashSet<String>();
 45+ params.addAll(Arrays.asList(tt));
 46+
 47+ return params;
 48+ }
 49+
 50+ public static Functor<?, String> newHash(String params, String lang) {
 51+ return newHash(parseParams(params), lang);
 52+ }
 53+
 54+ public static Functor<?, String> newHash(Set<String> params, String lang) {
 55+ //initial digest turns string into UTF-8 bytes
 56+ Functor<byte[], String> digest;
 57+
 58+ try {
 59+ if (params.contains("utf8")) digest = new Codec.Encoder<String, byte[]>(new CharsetCodec("UTF-8"));
 60+ else digest = new Codec.Encoder<String, byte[]>(new CharsetCodec("UTF-16"));
 61+
 62+ //apply md5 digest or huffman compression
 63+ if (params.contains("md5")) digest = new Functor.Composite<byte[], byte[], String>(digest, new BlockDigest("MD5"));
 64+ else if (params.contains("sha1")) digest = new Functor.Composite<byte[], byte[], String>(digest, new BlockDigest("SHA-1"));
 65+ else if (params.contains("huff") || params.contains("huffman")) digest = new Functor.Composite<byte[], byte[], String>(digest, getHuffmanEncoder(lang));
 66+ } catch (UnsupportedEncodingException e) {
 67+ throw new IllegalArgumentException(e);
 68+ } catch (NoSuchAlgorithmException e) {
 69+ throw new IllegalArgumentException(e);
 70+ } catch (IOException e) {
 71+ throw new RuntimeException(e);
 72+ }
 73+
 74+ if (params.contains("fold64")) { //fold into Long
 75+ Functor<Long, byte[]> fold;
 76+ fold = XorFold64.instance;
 77+
 78+ Functor<Long, String> convert = new Functor.Composite<Long, byte[], String>(digest, fold);
 79+ return convert;
 80+ } else { //keep bytes, wrap in ByteArray
 81+ if (params.contains("wrap8")) digest = new Functor.Composite<byte[], byte[], String>(digest, new XorWrap(8));
 82+ else if (params.contains("wrap6")) digest = new Functor.Composite<byte[], byte[], String>(digest, new XorWrap(6));
 83+ else if (params.contains("wrap4")) digest = new Functor.Composite<byte[], byte[], String>(digest, new XorWrap(4));
 84+ else if (params.contains("wrap4")) digest = new Functor.Composite<byte[], byte[], String>(digest, new XorWrap(4));
 85+
 86+ //create converter that includes wrapping the byte array in a ByteString
 87+ Functor<ByteString, String> convert = new Functor.Composite<ByteString, byte[], String>(digest, ByteString.wrap);
 88+ return convert;
 89+ }
 90+ }
 91+
4792 public static KeyValueStore<String, Integer> newStore(String storeParams, String lang) {
4893 KeyValueStore<String, Integer> store = null;
4994
50 - String[] tt = storeParams.split("[,;|+/ &]+");
 95+ Set<String> params = parseParams(storeParams);
5196
52 - Set<String> params = new HashSet<String>();
53 - params.addAll(Arrays.asList(tt));
54 -
5597 if (params.contains("none") || params.contains("null")) store = null;
5698 else if (params.contains("string")) store = new MapLookup<String, Integer>(new HashMap<String, Integer>());
5799 else if (params.contains("utf8") || params.contains("utf16")) {
@@ -74,10 +116,7 @@
75117 }
76118
77119 if (params.contains("fold64")) { //fold into Long
78 - Functor<Long, byte[]> fold;
79 - fold = XorFold64.instance;
80 -
81 - Functor<Long, String> convert = new Functor.Composite<Long, byte[], String>(digest, fold);
 120+ Functor<Long, String> convert = (Functor<Long, String>)newHash(params, lang); //XXX: ugly cast
82121
83122 if (params.contains("primitive")) {
84123 LongIntLookup<Long> numStore = new LongIntLookup<Long>();
@@ -87,13 +126,9 @@
88127 store = new KeyDigestingValueStore<String, Long, Integer>(numStore, convert);
89128 }
90129 } else { //keep bytes, wrap in ByteArray
91 - if (params.contains("wrap8")) digest = new Functor.Composite<byte[], byte[], String>(digest, new XorWrap(8));
92 - else if (params.contains("wrap6")) digest = new Functor.Composite<byte[], byte[], String>(digest, new XorWrap(6));
93 - else if (params.contains("wrap4")) digest = new Functor.Composite<byte[], byte[], String>(digest, new XorWrap(4));
94 - else if (params.contains("wrap4")) digest = new Functor.Composite<byte[], byte[], String>(digest, new XorWrap(4));
95 -
 130+
96131 //create converter that includes wrapping the byte array in a ByteString
97 - Functor<ByteString, String> convert = new Functor.Composite<ByteString, byte[], String>(digest, ByteString.wrap);
 132+ Functor<ByteString, String> convert = (Functor<ByteString, String>)newHash(params, lang); //XXX: ugly cast
98133
99134 //set up the store
100135 MapLookup<ByteString, Integer> byteStore = new MapLookup<ByteString, Integer>(new HashMap<ByteString, Integer>());
Index: trunk/WikiWord/WikiWordBuilder/debug-tweaks.properties
@@ -75,11 +75,19 @@
7676 # NOTE: when using this, allow for 116 bytes plus the average size of names per ID entry.
7777 # So if you have an average name length of 12 and expect 1 million entries,
7878 # allow for about 1.3 gigabyte RAM to be used for ID caching.
79 -dbstore.idManager=false
 79+dbstore.idManager=true
8080 #dbstore.auxFileDir defaults to system temp dir
8181 #dbstore.auxFileDir="/tmp"
8282 dbstore.idManager.bufferSize=16384
8383
 84+#idStoreParameters:
 85+# basic: string (default), utf8, or utf16
 86+# for utf8 and utf16: md5, sha1, or huffman (or nothing)
 87+# for utf8 and utf16: wrap8 (wrap to 8 bytes), fold64 (wrap to single long value)
 88+# for fold64: primitive (use gnu trove primitive hash)
 89+# "utf16+md5+fold64+primitive" uses about one third of the memory used by "string"
 90+dbstore.idManager.idStoreParameters="utf16+md5+fold64+primitive"
 91+
8492 ### CycleFinder #####################################
8593 dbstore.CycleFinder.levelWarningThreshold=32
8694 dbstore.CycleFinder.degreeWarningThreshold=1024

Status & tagging log