Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/schema/ProximityStoreSchema.java |
— | — | @@ -43,18 +43,21 @@ |
44 | 44 | * concepts the relation applies to. That is, A's in_degree is the size of the set of all Bs for which in(A, B) applies. |
45 | 45 | * This bias is combined with the bias coefficient for a given relation to form the effective bias for that relation, e.g.: |
46 | 46 | * <tt>in_effective_bias(B) = 1 - ( ( 1 - in_bias(B) ) * in_bias_coef )</tt> which amounts to <tt>1 - ( ( log(in_degree(B)) / log(number_of_concepts) ) * in_bias_coef )</tt>. |
47 | | - * For each relation, there's also weight factor provided, which is applied to complement's bias. So if in(A, B) applies, in_w(A, B) is given by: |
48 | | - * <tt>in_weight_factor * out_effective_bias(B) </tt>; the feature vector for A is then calculated for each feature B as follows: |
49 | | - * <tt>A[B] = w(A,B) = in_w(A, B) + out_w(A, B) + up_w(A, B) + down_w(A, B)</tt>. Note that A[A] = c, where c is the "self-weight", which usually equals 1. |
| 47 | + * For each relation, there's also a weight factor provided, which is applied to the complement's bias. |
| 48 | + * To calculate the effective weight of an association, the effective bias on both "sides" of the relation is combined with the weight factor for that relation. |
| 49 | + * So if in(A, B) applies, in_w(A, B) is given by: <tt>in_weight_factor * in_effective_bias(A) * out_effective_bias(B) </tt>. |
| 50 | + * The feature vector for A is then calculated for each feature B as follows: |
| 51 | + * <tt>A[B] = w(A,B) = in_w(A, B) + out_w(A, B) + up_w(A, B) + down_w(A, B)</tt>. Note that A[A] = c, where c is the "self-weight", which usually equals 1. |
| 52 | + * Depending on the weight-factors used, the weight function may or may not be symmetric: in_w(A, B) may always be different from in_w(B, A), however, |
| 53 | + * if in_weight_factor = out_weight_factor, then in_w(A, B) = out_w(B, A), and up_w(A, B) = down_w(B, A) if up_weight_factor = down_weight_factor. |
| 54 | + * Thus, w(A,B) = w(B, A) and A[B] = B[A] if in_weight_factor = out_weight_factor and up_weight_factor = down_weight_factor. |
50 | 55 | * </p> |
51 | 56 | * |
52 | | - * <p>The self, weight, the four bias-coeficients and the four weight-factors are the parameters for the feature vector calculation. |
| 57 | + * <p>The self-weight, the four bias-coefficients and the four weight-factors are the parameters for the feature vector calculation. |
53 | 58 | * They can be tweaked to adjust the relative weight given to the different types of relations in the thesaurus with respect to determining the semantic proximity, |
54 | 59 | * that is, the thematic similarity, of concepts. E.g. having similar incoming links (i.e. frequent co-occurrence of references) is a stronger indicator |
55 | 60 | * of similarity than common outgoing links. |
56 | 61 | * </p> |
57 | | - * |
58 | | - * <p>Note that as a result of the rules above, the weight of the association is not symmetrical: w(A, B) may be different from w(B,A)</p> |
59 | 62 | * |
60 | 63 | * <h4>Table <tt>proximity</tt></h4> |
61 | 64 | * <p>Holds statistical figures relating to the entire thesaurus.</p> |
— | — | @@ -63,8 +66,10 @@ |
64 | 67 | * <dt>concept2</dt><dd>The second concept. Comprises a unique key together with concept1.</dd> |
65 | 68 | * <dt>proximity</dt><dd>The semantic proximity of concept1 and concept2. This is given by the scalar product |
66 | 69 | * of the normalized feature vectors of concept1 and concept2, as stored in the feature table. |
| 70 | + * This can be interpreted as the cosine of the angle between the concepts' feature vectors. |
67 | 71 | * Entries with a low proximity value may be omitted (subject to tweak value <tt>proximity.threshold</tt>).</dd> |
68 | 72 | * </dl> |
| 73 | + * <p>Note that the proximity relation is symmetrical, i.e. prox(A, B) = prox(B, A), regardless of the weight factors used.</p> |
69 | 74 | * |
70 | 75 | * @author daniel |
71 | 76 | */ |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/BuildProximity.java |
— | — | @@ -44,7 +44,7 @@ |
45 | 45 | this.proximityStore.buildProximity(); |
46 | 46 | |
47 | 47 | section("-- statistics --------------------------------------------------"); |
48 | | - conceptStore.getConceptStore().getStatisticsStore().dumpStatistics(getLogOutput()); |
| 48 | + conceptStore.getProximityStoreBuilder().dumpTableStats(out); |
49 | 49 | } |
50 | 50 | |
51 | 51 | public static void main(String[] argv) throws Exception { |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/store/builder/DatabaseProximityStoreBuilder.java |
— | — | @@ -12,6 +12,7 @@ |
13 | 13 | import de.brightbyte.db.RelationTable; |
14 | 14 | import de.brightbyte.util.PersistenceException; |
15 | 15 | import de.brightbyte.wikiword.TweakSet; |
| 16 | +import de.brightbyte.wikiword.processor.ImportProgressTracker; |
16 | 17 | import de.brightbyte.wikiword.schema.ProximityStoreSchema; |
17 | 18 | import de.brightbyte.wikiword.schema.WikiWordConceptStoreSchema; |
18 | 19 | |
— | — | @@ -47,29 +48,29 @@ |
48 | 49 | proximityThreshold = tweaks.getTweak("proximity.threshold", 0.15); |
49 | 50 | } |
50 | 51 | |
| 52 | + private static String getBiasFormula(String biasField, double biasCoef) { |
| 53 | + if ( biasField == null || biasCoef <= 0) return "1"; |
| 54 | + else if (biasCoef==1) return biasField; |
| 55 | + else if (biasCoef>1) throw new IllegalArgumentException("biasCoef must not be greater than 1"); |
| 56 | + else return "( 1 - ( ( 1 - "+biasField+" ) * "+biasCoef+" ) ) "; |
| 57 | + } |
| 58 | + |
51 | 59 | /** |
52 | 60 | * Builds feature vectors. For a specification, refer to ProximityStoreSchema |
53 | 61 | */ |
54 | | - protected int buildFeatures(DatabaseTable t, String conceptField, String featureField, String suffix, double w, String biasField, double biasCoef) throws PersistenceException { |
| 62 | + protected int buildFeatures(DatabaseTable t, String conceptField, String featureField, String suffix, double w, String baseBiasField, double baseBiasCoef, String targetBiasField, double targetBiasCoef) throws PersistenceException { |
55 | 63 | if (!conceptStore.areStatsComplete()) throw new IllegalStateException("statistics need to be built before concept infos!"); |
56 | 64 | |
57 | 65 | String v = ""+w; |
| 66 | + if (baseBiasField!=null && baseBiasCoef>0) v = getBiasFormula("B."+baseBiasField, baseBiasCoef) + " * " + v; |
| 67 | + if (targetBiasField!=null && targetBiasCoef>0) v = getBiasFormula("D."+targetBiasField, targetBiasCoef) + " * " + v; |
58 | 68 | |
59 | | - //NOTE: conider bias of reference target |
60 | | - //FIXME: also consider local (outgoing) bias? feature vectors will be normalized, so that's not so relevant maybe? |
61 | | - //NOTE: since there are usually more link than categories, there's a bias in favor of categories! |
62 | | - // number of links grows with article length, number of categories does not! |
63 | | - if (biasField!=null && biasCoef>0) { |
64 | | - if (biasCoef==1) v = "D."+biasField+" * "+w; |
65 | | - else if (biasCoef>1) throw new IllegalArgumentException("biasCoef must not be greater than 1"); |
66 | | - else v = "( 1 - ( ( 1 - D."+biasField+" ) * "+biasCoef+" ) ) * "+w; |
67 | | - } |
68 | | - |
69 | 69 | DatabaseTable degreeTable = conceptStore.getStatisticsStoreBuilder().getDatabaseAccess().getTable("degree"); |
70 | 70 | |
71 | 71 | String sql = "INSERT INTO "+featureTable.getSQLName()+" (concept, feature, total_weight) "; |
72 | 72 | sql += " SELECT T."+conceptField+", T."+featureField+", "+v+" FROM "+t.getSQLName()+" as T "; |
73 | | - if (biasField!=null && biasCoef!=0) sql += " JOIN "+degreeTable.getSQLName()+" as D ON T."+featureField+" = D.concept "; |
| 73 | + if (baseBiasField!=null && baseBiasCoef!=0) sql += " JOIN "+degreeTable.getSQLName()+" as B ON T."+conceptField+" = B.concept "; |
| 74 | + if (targetBiasField!=null && targetBiasCoef!=0) sql += " JOIN "+degreeTable.getSQLName()+" as D ON T."+featureField+" = D.concept "; |
74 | 75 | |
75 | 76 | if (suffix!=null) sql += " "+suffix+" "; |
76 | 77 | |
— | — | @@ -97,22 +98,22 @@ |
98 | 99 | } |
99 | 100 | |
100 | 101 | if (beginTask("buildFeatures", "feature#down")) { |
101 | | - int n = buildFeatures(broaderTable, "broad", "narrow", null, featureVectorFactors.downWeight, "up_bias", featureVectorFactors.downBiasCoef); |
| 102 | + int n = buildFeatures(broaderTable, "broad", "narrow", null, featureVectorFactors.downWeight, "down_bias", featureVectorFactors.downBiasCoef, "up_bias", featureVectorFactors.upBiasCoef); |
102 | 103 | endTask("buildFeatures", "feature#down", n+" entries"); |
103 | 104 | } |
104 | 105 | |
105 | 106 | if (beginTask("buildFeatures", "feature#up")) { |
106 | | - int n = buildFeatures(broaderTable, "narrow", "broad", null, featureVectorFactors.upWeight, "down_bias", featureVectorFactors.upBiasCoef); |
| 107 | + int n = buildFeatures(broaderTable, "narrow", "broad", null, featureVectorFactors.upWeight, "up_bias", featureVectorFactors.upBiasCoef, "down_bias", featureVectorFactors.downBiasCoef); |
107 | 108 | endTask("buildFeatures", "feature#up", n+" entries"); |
108 | 109 | } |
109 | 110 | |
110 | 111 | if (beginTask("buildFeatures", "feature#out")) { |
111 | | - int n = buildFeatures(linkTable, "anchor", "target", null, featureVectorFactors.outWeight, "in_bias", featureVectorFactors.outBiasCoef); |
| 112 | + int n = buildFeatures(linkTable, "anchor", "target", null, featureVectorFactors.outWeight, "out_bias", featureVectorFactors.outBiasCoef, "in_bias", featureVectorFactors.inBiasCoef); |
112 | 113 | endTask("buildFeatures", "feature#out", n+" entries"); |
113 | 114 | } |
114 | 115 | |
115 | 116 | if (beginTask("buildFeatures", "feature#in")) { |
116 | | - int n = buildFeatures(linkTable, "target", "anchor", null, featureVectorFactors.inWeight, "out_bias", featureVectorFactors.inBiasCoef); |
| 117 | + int n = buildFeatures(linkTable, "target", "anchor", null, featureVectorFactors.inWeight, "in_bias", featureVectorFactors.inBiasCoef, "out_bias", featureVectorFactors.outBiasCoef); |
117 | 118 | endTask("buildFeatures", "feature#in", n+" entries"); |
118 | 119 | } |
119 | 120 | |
— | — | @@ -183,12 +184,17 @@ |
184 | 185 | protected String name; |
185 | 186 | protected DatabaseTable conceptTable; |
186 | 187 | protected int lastId ; |
| 188 | + |
| 189 | + protected ImportProgressTracker conceptTracker; |
| 190 | + protected ImportProgressTracker featureTracker; |
187 | 191 | |
188 | 192 | public CollectProximityQuery(String context, String name) { |
189 | 193 | super(); |
190 | 194 | this.context = context; |
191 | 195 | this.name = name; |
192 | 196 | this.conceptTable = conceptStore.getDatabaseAccess().getTable("concept"); |
| 197 | + this.conceptTracker = new ImportProgressTracker("concepts"); |
| 198 | + this.featureTracker = new ImportProgressTracker("features"); |
193 | 199 | } |
194 | 200 | |
195 | 201 | public String getChunkField() { |
— | — | @@ -228,12 +234,25 @@ |
229 | 235 | sql += " ORDER BY id ASC"; |
230 | 236 | |
231 | 237 | int n = 0; |
| 238 | + int i = 0; |
232 | 239 | try { |
233 | 240 | ResultSet res = DatabaseProximityStoreBuilder.this.executeQuery(context+"::"+name+"#chunk"+chunk, sql); |
234 | 241 | while (res.next()) { |
235 | 242 | lastId = res.getInt(1); |
236 | 243 | |
237 | | - n+= insertProximity(lastId); //TODO: progress tracker! |
| 244 | + int c = insertProximity(lastId); //number of entries inserted for this concept; progress is reported via the trackers below |
| 245 | + n+= c; |
| 246 | + i+= 1; |
| 247 | + |
| 248 | + conceptTracker.step(); |
| 249 | + featureTracker.step(c); |
| 250 | + |
| 251 | + if ( (i % 1000) == 0 ) { |
| 252 | + conceptTracker.chunk(); |
| 253 | + featureTracker.chunk(); |
| 254 | + log("- "+conceptTracker); |
| 255 | + log("- "+featureTracker); |
| 256 | + } |
238 | 257 | } |
239 | 258 | |
240 | 259 | res.close(); |
— | — | @@ -241,6 +260,11 @@ |
242 | 261 | throw new PersistenceException(e); |
243 | 262 | } |
244 | 263 | |
| 264 | + conceptTracker.chunk(); |
| 265 | + featureTracker.chunk(); |
| 266 | + log("- "+conceptTracker); |
| 267 | + log("- "+featureTracker); |
| 268 | + |
245 | 269 | flush(); |
246 | 270 | return n; |
247 | 271 | } |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/store/builder/ProximityStoreBuilder.java |
— | — | @@ -10,20 +10,20 @@ |
11 | 11 | //NOTE: since there are usually more links than categories, there's a bias in favor of categories! |
12 | 12 | // number of links grows with article length, number of categories does not! |
13 | 13 | |
14 | | - public final double selfWeight = 1; |
| 14 | + public final double selfWeight = 4; |
15 | 15 | //public final double weightOffset = 1; |
16 | 16 | |
17 | | - public final double downWeight = 0.5; //having common children is not very relevant |
18 | | - public final double downBiasCoef = 0; //if a child has many parents doesn't matter |
| 17 | + public final double downWeight = 0.2; //having common children is not very relevant; also, categorization is favored by systemic bias, so tone it down. |
| 18 | + public final double downBiasCoef = 1; //if the parent has many children should be considered |
19 | 19 | |
20 | | - public final double upWeight = 1.2; //having common parents is interesting |
21 | | - public final double upBiasCoef = 1; //if the parent has many children should be considered |
| 20 | + public final double upWeight = 1.2; //having common parents is interesting; note: categorization is favored by systemic bias, but the bias is tuned out here anyway. |
| 21 | + public final double upBiasCoef = 0.1; //if a child has many parents doesn't matter |
22 | 22 | |
23 | | - public final double inWeight = 1.2; //bein referenced from the same place is a string factor |
24 | | - public final double inBiasCoef = 0.2; //if the link's origin contains a lot of links is not so important; note: dampen link bias |
| 23 | + public final double inWeight = 1.5; //being referenced from the same place is a strong factor |
| 24 | + public final double inBiasCoef = 1; //if the concept is referenced a lot, co-reference becomes less relevant |
25 | 25 | |
26 | | - public final double outWeight = 0.5; //referencing the same thing isn't so very important |
27 | | - public final double outBiasCoef = 0.5; //if the link's target is used a lot is a major factor; note: dampen link bias |
| 26 | + public final double outWeight = 1.0; //referencing the same thing is a good indicator |
| 27 | + public final double outBiasCoef = 0.2; //if the concept has many outgoing links doesn't matter much |
28 | 28 | } |
29 | 29 | |
30 | 30 | public void buildFeatures() throws PersistenceException; |