Index: trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/store/builder/NameHashTrial.java |
— | — | @@ -0,0 +1,57 @@ |
| 2 | +package de.brightbyte.wikiword.store.builder; |
| 3 | + |
| 4 | +import java.io.BufferedReader; |
| 5 | +import java.io.BufferedWriter; |
| 6 | +import java.io.FileInputStream; |
| 7 | +import java.io.FileOutputStream; |
| 8 | +import java.io.IOException; |
| 9 | +import java.io.InputStream; |
| 10 | +import java.io.InputStreamReader; |
| 11 | +import java.io.OutputStream; |
| 12 | +import java.io.OutputStreamWriter; |
| 13 | +import java.io.PrintWriter; |
| 14 | +import java.security.NoSuchAlgorithmException; |
| 15 | + |
| 16 | +import de.brightbyte.data.Functor; |
| 17 | +import de.brightbyte.util.PersistenceException; |
| 18 | +import de.brightbyte.wikiword.builder.NameMaps; |
| 19 | + |
| 20 | +public class NameHashTrial { //command line trial: hashes each input line and prints "hash<TAB>line" |
| 21 | + public static void main(String[] args) throws IOException, PersistenceException, NoSuchAlgorithmException, InterruptedException { |
| 22 | + String params = args[0]; //hash parameter spec, handed to NameMaps.newHash |
| 23 | + int limit = Integer.parseInt(args[1]); //maximum number of input lines to process |
| 24 | + |
| 25 | + Functor<?, String> hash = NameMaps.newHash(params, "en"); |
| 26 | + |
| 27 | + InputStream rawIn = args.length>2 && !args[2].equals("-") ? new FileInputStream(args[2]) : System.in; //args[2]: input file; absent or "-" means stdin |
| 28 | + OutputStream rawOut = args.length>3 && !args[3].equals("-") ? new FileOutputStream(args[3]) : System.out; //args[3]: output file; absent or "-" means stdout |
| 29 | + |
| 30 | + BufferedReader in = new BufferedReader(new InputStreamReader(rawIn, "UTF-8")); |
| 31 | + PrintWriter out = new PrintWriter(new BufferedWriter(new OutputStreamWriter(rawOut, "UTF-8"))); |
| 32 | + java.io.PrintStream status = (rawOut==System.out) ? System.err : System.out; //keep status messages out of the data when data goes to stdout |
| 33 | + long start = System.nanoTime(); |
| 34 | + |
| 35 | + status.println("Reading input..."); |
| 36 | + String s; |
| 37 | + int c = 0; |
| 38 | + while (c<limit && (s = in.readLine()) != null) { |
| 39 | + c++; |
| 40 | + //the limit is checked in the loop condition, so c never exceeds it and no extra line is consumed |
| 41 | + |
| 42 | + Object h = hash.apply(s); |
| 43 | + |
| 44 | + out.println(h+"\t"+s); |
| 45 | + if (rawOut==System.out) out.flush(); //flush per line when writing to stdout |
| 46 | + |
| 47 | + if (c % 10000 == 0) status.format(" at %d\n", c); |
| 48 | + } |
| 49 | + |
| 50 | + if (rawOut!=System.out) out.close(); |
| 51 | + else out.flush(); //never close stdout, just flush it |
| 52 | + |
| 53 | + if (rawIn!=System.in) in.close(); |
| 54 | + |
| 55 | + long t = System.nanoTime() - start; |
| 56 | + status.format("Processed %d entries in %01.3f sec\n", c, t/1000000000.0); |
| 57 | + } |
| 58 | +} |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/NameMaps.java |
— | — | @@ -1,6 +1,5 @@ |
2 | 2 | package de.brightbyte.wikiword.builder; |
3 | 3 | |
4 | | -import java.io.File; |
5 | 4 | import java.io.IOException; |
6 | 5 | import java.io.UnsupportedEncodingException; |
7 | 6 | import java.net.URL; |
— | — | @@ -8,12 +7,8 @@ |
9 | 8 | import java.util.Arrays; |
10 | 9 | import java.util.HashMap; |
11 | 10 | import java.util.HashSet; |
12 | | -import java.util.Map; |
13 | 11 | import java.util.Set; |
14 | 12 | |
15 | | -import org.ardverk.collection.PatriciaTrie; |
16 | | -import org.ardverk.collection.StringKeyAnalyzer; |
17 | | - |
18 | 13 | import de.brightbyte.data.BlockDigest; |
19 | 14 | import de.brightbyte.data.ByteString; |
20 | 15 | import de.brightbyte.data.Codec; |
— | — | @@ -22,7 +17,6 @@ |
23 | 18 | import de.brightbyte.data.KeyValueStore; |
24 | 19 | import de.brightbyte.data.LongIntLookup; |
25 | 20 | import de.brightbyte.data.MapLookup; |
26 | | -import de.brightbyte.data.XorFold32; |
27 | 21 | import de.brightbyte.data.XorFold64; |
28 | 22 | import de.brightbyte.data.XorWrap; |
29 | 23 | import de.brightbyte.io.HuffmanDataCodec; |
— | — | @@ -43,14 +37,62 @@ |
44 | 38 | return new HashMap<String, V>(); |
45 | 39 | }*/ |
46 | 40 | |
| 41 | + protected static Set<String> parseParams(String spec) { |
| 42 | + //tokens are separated by runs of ',', ';', '|', '+', '/', ' ' or '&' |
| 43 | + Set<String> result = new HashSet<String>(); |
| 44 | + for (String token : spec.split("[,;|+/ &]+")) { |
| 45 | + result.add(token); |
| 46 | + } |
| 47 | + return result; |
| 48 | + } |
| 49 | + |
| 50 | + public static Functor<?, String> newHash(String params, String lang) { //convenience overload: splits the spec with parseParams |
| 51 | + return newHash(parseParams(params), lang); |
| 52 | + } |
| 53 | + |
| 54 | + public static Functor<?, String> newHash(Set<String> params, String lang) { //yields Long for "fold64", ByteString otherwise |
| 55 | + //initial digest turns string into bytes: UTF-8 if "utf8" is given, UTF-16 otherwise |
| 56 | + Functor<byte[], String> digest; |
| 57 | + |
| 58 | + try { |
| 59 | + if (params.contains("utf8")) digest = new Codec.Encoder<String, byte[]>(new CharsetCodec("UTF-8")); |
| 60 | + else digest = new Codec.Encoder<String, byte[]>(new CharsetCodec("UTF-16")); |
| 61 | + |
| 62 | + //apply md5 or sha1 digest, or huffman compression (at most one; checked in this order) |
| 63 | + if (params.contains("md5")) digest = new Functor.Composite<byte[], byte[], String>(digest, new BlockDigest("MD5")); |
| 64 | + else if (params.contains("sha1")) digest = new Functor.Composite<byte[], byte[], String>(digest, new BlockDigest("SHA-1")); |
| 65 | + else if (params.contains("huff") || params.contains("huffman")) digest = new Functor.Composite<byte[], byte[], String>(digest, getHuffmanEncoder(lang)); |
| 66 | + } catch (UnsupportedEncodingException e) { |
| 67 | + throw new IllegalArgumentException(e); |
| 68 | + } catch (NoSuchAlgorithmException e) { |
| 69 | + throw new IllegalArgumentException(e); |
| 70 | + } catch (IOException e) { |
| 71 | + throw new RuntimeException(e); |
| 72 | + } |
| 73 | + |
| 74 | + if (params.contains("fold64")) { //fold into Long |
| 75 | + Functor<Long, byte[]> fold; |
| 76 | + fold = XorFold64.instance; |
| 77 | + |
| 78 | + Functor<Long, String> convert = new Functor.Composite<Long, byte[], String>(digest, fold); |
| 79 | + return convert; |
| 80 | + } else { //keep bytes, wrap in ByteString |
| 81 | + if (params.contains("wrap8")) digest = new Functor.Composite<byte[], byte[], String>(digest, new XorWrap(8)); |
| 82 | + else if (params.contains("wrap6")) digest = new Functor.Composite<byte[], byte[], String>(digest, new XorWrap(6)); |
| 83 | + else if (params.contains("wrap4")) digest = new Functor.Composite<byte[], byte[], String>(digest, new XorWrap(4)); |
| 84 | + //an unreachable duplicate "wrap4" check was removed here; without a wrapN parameter no XorWrap is applied |
| 85 | + |
| 86 | + //create converter that includes wrapping the byte array in a ByteString |
| 87 | + Functor<ByteString, String> convert = new Functor.Composite<ByteString, byte[], String>(digest, ByteString.wrap); |
| 88 | + return convert; |
| 89 | + } |
| 90 | + } |
| 91 | + |
47 | 92 | public static KeyValueStore<String, Integer> newStore(String storeParams, String lang) { |
48 | 93 | KeyValueStore<String, Integer> store = null; |
49 | 94 | |
50 | | - String[] tt = storeParams.split("[,;|+/ &]+"); |
| 95 | + Set<String> params = parseParams(storeParams); |
51 | 96 | |
52 | | - Set<String> params = new HashSet<String>(); |
53 | | - params.addAll(Arrays.asList(tt)); |
54 | | - |
55 | 97 | if (params.contains("none") || params.contains("null")) store = null; |
56 | 98 | else if (params.contains("string")) store = new MapLookup<String, Integer>(new HashMap<String, Integer>()); |
57 | 99 | else if (params.contains("utf8") || params.contains("utf16")) { |
— | — | @@ -74,10 +116,7 @@ |
75 | 117 | } |
76 | 118 | |
77 | 119 | if (params.contains("fold64")) { //fold into Long |
78 | | - Functor<Long, byte[]> fold; |
79 | | - fold = XorFold64.instance; |
80 | | - |
81 | | - Functor<Long, String> convert = new Functor.Composite<Long, byte[], String>(digest, fold); |
| 120 | + Functor<Long, String> convert = (Functor<Long, String>)newHash(params, lang); //XXX: ugly cast |
82 | 121 | |
83 | 122 | if (params.contains("primitive")) { |
84 | 123 | LongIntLookup<Long> numStore = new LongIntLookup<Long>(); |
— | — | @@ -87,13 +126,9 @@ |
88 | 127 | store = new KeyDigestingValueStore<String, Long, Integer>(numStore, convert); |
89 | 128 | } |
90 | 129 | } else { //keep bytes, wrap in ByteArray |
91 | | - if (params.contains("wrap8")) digest = new Functor.Composite<byte[], byte[], String>(digest, new XorWrap(8)); |
92 | | - else if (params.contains("wrap6")) digest = new Functor.Composite<byte[], byte[], String>(digest, new XorWrap(6)); |
93 | | - else if (params.contains("wrap4")) digest = new Functor.Composite<byte[], byte[], String>(digest, new XorWrap(4)); |
94 | | - else if (params.contains("wrap4")) digest = new Functor.Composite<byte[], byte[], String>(digest, new XorWrap(4)); |
95 | | - |
| 130 | + |
96 | 131 | //create converter that includes wrapping the byte array in a ByteString |
97 | | - Functor<ByteString, String> convert = new Functor.Composite<ByteString, byte[], String>(digest, ByteString.wrap); |
| 132 | + Functor<ByteString, String> convert = (Functor<ByteString, String>)newHash(params, lang); //XXX: ugly cast |
98 | 133 | |
99 | 134 | //set up the store |
100 | 135 | MapLookup<ByteString, Integer> byteStore = new MapLookup<ByteString, Integer>(new HashMap<ByteString, Integer>()); |
Index: trunk/WikiWord/WikiWordBuilder/debug-tweaks.properties |
— | — | @@ -75,11 +75,19 @@ |
76 | 76 | # NOTE: when using this, allow for 116 bytes plus the average size of names per ID entry. |
77 | 77 | # So if you have an average name length of 12 and expect 1 million entries, |
78 | 78 | # allow for about 1.3 gigabyte RAM to be used for ID caching. |
79 | | -dbstore.idManager=false |
| 79 | +dbstore.idManager=true |
80 | 80 | #dbstore.auxFileDir defaults to system temp dir |
81 | 81 | #dbstore.auxFileDir="/tmp" |
82 | 82 | dbstore.idManager.bufferSize=16384 |
83 | 83 | |
| 84 | +#idStoreParameters: |
| 85 | +# basic: string (default), utf8, or utf16 |
| 86 | +# for utf8 and utf16: md5, sha1, or huffman (or nothing) |
| 87 | +# for utf8 and utf16: wrap8, wrap6 or wrap4 (wrap to 8, 6 or 4 bytes), or fold64 (fold into a single long value) |
| 88 | +# for fold64: primitive (use gnu trove primitive hash) |
| 89 | +# "utf16+md5+fold64+primitive" uses about one third of the memory used by "string" |
| 90 | +dbstore.idManager.idStoreParameters="utf16+md5+fold64+primitive" |
| 91 | + |
84 | 92 | ### CycleFinder ##################################### |
85 | 93 | dbstore.CycleFinder.levelWarningThreshold=32 |
86 | 94 | dbstore.CycleFinder.degreeWarningThreshold=1024 |