Index: trunk/WikiWord/CatGraph/.classpath |
— | — | @@ -12,5 +12,8 @@ |
13 | 13 | </classpathentry> |
14 | 14 | <classpathentry kind="var" path="M2_REPO/org/apache/geronimo/specs/geronimo-jta_1.1_spec/1.1.1/geronimo-jta_1.1_spec-1.1.1.jar"/> |
15 | 15 | <classpathentry kind="var" path="M2_REPO/org/neo4j/neo4j-index/1.0/neo4j-index-1.0.jar"/> |
| 16 | + <classpathentry kind="var" path="M2_REPO/org/apache/lucene/lucene-core/2.9.1/lucene-core-2.9.1.jar"/> |
| 17 | + <classpathentry kind="var" path="M2_REPO/org/neo4j/neo4j-commons/1.0/neo4j-commons-1.0.jar"/> |
| 18 | + <classpathentry kind="var" path="M2_REPO/mysql/mysql-connector-java/3.1.11/mysql-connector-java-3.1.11.jar"/> |
16 | 19 | <classpathentry kind="output" path="bin"/> |
17 | 20 | </classpath> |
Index: trunk/WikiWord/CatGraph/src/main/java/de/wikimedia/catgraph/CatGraph.java |
— | — | @@ -1,9 +1,17 @@ |
2 | 2 | package de.wikimedia.catgraph; |
3 | 3 | |
4 | 4 | import java.io.File; |
| 5 | +import java.io.FileInputStream; |
5 | 6 | import java.io.IOException; |
| 7 | +import java.io.InputStreamReader; |
| 8 | +import java.net.URL; |
6 | 9 | import java.sql.ResultSet; |
7 | 10 | import java.sql.SQLException; |
| 11 | +import java.util.ArrayList; |
| 12 | +import java.util.Collection; |
| 13 | +import java.util.Iterator; |
| 14 | +import java.util.List; |
| 15 | +import java.util.Map; |
8 | 16 | |
9 | 17 | import org.neo4j.graphdb.Direction; |
10 | 18 | import org.neo4j.graphdb.GraphDatabaseService; |
— | — | @@ -18,12 +26,36 @@ |
19 | 27 | import org.neo4j.index.lucene.LuceneIndexService; |
20 | 28 | import org.neo4j.kernel.EmbeddedGraphDatabase; |
21 | 29 | |
| 30 | +import de.brightbyte.application.Arguments; |
| 31 | +import de.brightbyte.application.ConsoleApp; |
| 32 | +import de.brightbyte.audit.DebugUtil; |
| 33 | +import de.brightbyte.data.Pair; |
| 34 | +import de.brightbyte.data.cursor.DataCursor; |
22 | 35 | import de.brightbyte.db.DatabaseAccess; |
23 | | -import de.brightbyte.db.DatabaseConnectionInfo; |
24 | | -import de.brightbyte.db.DatabaseSchema; |
25 | | -import de.brightbyte.io.IOUtil; |
| 36 | +import de.brightbyte.db.DatabaseUtil; |
| 37 | +import de.brightbyte.io.ChunkingCursor; |
| 38 | +import de.brightbyte.io.LineCursor; |
| 39 | +import de.brightbyte.io.Output; |
| 40 | +import de.brightbyte.job.ChunkedProgressRateTracker; |
| 41 | +import de.brightbyte.text.CsvLineChunker; |
| 42 | +import de.brightbyte.util.CollectionUtils; |
| 43 | +import de.brightbyte.util.PersistenceException; |
| 44 | +import de.brightbyte.util.SystemUtils; |
26 | 45 | |
27 | | -public class CatGraph { |
| 46 | +public class CatGraph extends ConsoleApp { |
| | + /** |
| | +  * Console command that lists the descendants of one page. |
| | +  * Instances are created by {@code newCommand} for "descendants", "desc" and "d". |
| | +  */ |
| 47 | + protected class Descendants implements Command { |
| 48 | + |
| | +     /** page_id the listing starts from; set once in the constructor. */ |
| 49 | +     private final int start; |
| 50 | + |
| | +     /** @param start page_id whose descendants this command will list */ |
| 51 | +     public Descendants(int start) { |
| 52 | +         this.start = start; |
| 53 | +     } |
| 54 | + |
| | +     /** |
| | +      * Delegates to {@code listDescendants(start, out)} of the enclosing instance. |
| | +      * The {@code app} argument is not used. |
| | +      */ |
| 55 | +     public void execute(ConsoleApp app) throws Exception { |
| 56 | +         listDescendants(start, out); |
| 57 | +     } |
| 58 | + } |
| 59 | + |
28 | 60 | public enum CategoryRelationships implements RelationshipType |
29 | 61 | { |
30 | 62 | CONTAINS |
— | — | @@ -37,16 +69,74 @@ |
38 | 70 | this.indexer = indexer; |
39 | 71 | } |
40 | 72 | |
41 | | - public void load(DatabaseAccess db, String sql) throws SQLException { |
| | + /** |
| | +  * Runs {@code sql} through {@code db.executeQuery} and adds one arc per result row. |
| | +  * |
| | +  * @param db database access used to execute the query |
| | +  * @param sql query whose rows each describe one arc |
| | +  * @param fromCol column index handed to {@code ResultSet.getInt}; its value becomes the first argument of {@code putArc} |
| | +  * @param toCol column index handed to {@code ResultSet.getInt}; its value becomes the second argument of {@code putArc} |
| | +  * @throws SQLException if executing the query or reading a row fails |
| | +  */ |
| 73 | + public void loadArcs(DatabaseAccess db, String sql, int fromCol, int toCol) throws SQLException { |
42 | 74 | ResultSet rs = db.executeQuery("load graph", sql); |
| | + // NOTE(review): rs is never closed in this method; confirm whether DatabaseAccess releases it. |
43 | 75 | while (rs.next()) { |
44 | | - int from = rs.getInt(1); |
45 | | - int to = rs.getInt(2); |
| 76 | + int from = rs.getInt(fromCol); |
| 77 | + int to = rs.getInt(toCol); |
46 | 78 | |
47 | 79 | putArc(from ,to); |
48 | 80 | } |
49 | 81 | } |
| 82 | + |
| | + /** |
| | +  * Reads arcs from a cursor of rows and stores each one with {@code putArc}. |
| | +  * Every row is counted on a progress tracker; the tracker is printed to {@code out} |
| | +  * whenever its {@code chunkIf(10000, 10)} check returns true. |
| | +  * |
| | +  * @param args cursor of rows; reading stops at the first {@code null} from {@code next()} |
| | +  * @param fromCol index (for {@code List.get}) of the value passed as first argument of {@code putArc} |
| | +  * @param toCol index (for {@code List.get}) of the value passed as second argument of {@code putArc} |
| | +  * @throws PersistenceException propagated from the cursor (presumably from {@code next()}) |
| | +  */ |
| 83 | + public void loadArcs(DataCursor<? extends List<?>> args, int fromCol, int toCol) throws PersistenceException { |
| 84 | +     ChunkedProgressRateTracker tracker = new ChunkedProgressRateTracker("arcs"); |
| 85 | + |
| 86 | +     for (List<?> record = args.next(); record != null; record = args.next()) { |
| 87 | +         int source = DatabaseUtil.asInt( record.get(fromCol) ); |
| 88 | +         int target = DatabaseUtil.asInt( record.get(toCol) ); |
| 89 | +         putArc(source, target); |
| 90 | + |
| 91 | +         tracker.step(); |
| 92 | +         boolean chunkComplete = tracker.chunkIf(10000, 10); |
| 93 | +         if ( chunkComplete ) out.println(tracker); |
| 94 | +     } |
| 95 | + } |
| 99 | + |
| | + /** |
| | +  * Reads arcs from a cursor of id pairs and stores each one with {@code putArc}. |
| | +  * Every pair is counted on a progress tracker; the tracker is printed to {@code out} |
| | +  * whenever its {@code chunkIf(10000, 10)} check returns true. |
| | +  * |
| | +  * @param args cursor of pairs; {@code getA()} is passed as first and {@code getB()} as second |
| | +  *        argument of {@code putArc}; reading stops at the first {@code null} from {@code next()} |
| | +  * @throws PersistenceException propagated from the cursor (presumably from {@code next()}) |
| | +  */ |
| 100 | + public void loadArcs(DataCursor<Pair<Integer, Integer>> args) throws PersistenceException { |
| 101 | +     ChunkedProgressRateTracker tracker = new ChunkedProgressRateTracker("arcs"); |
| 102 | + |
| 103 | +     for (Pair<Integer, Integer> pair = args.next(); pair != null; pair = args.next()) { |
| 104 | +         int source = pair.getA(); |
| 105 | +         int target = pair.getB(); |
| 106 | +         putArc(source, target); |
| 107 | + |
| 108 | +         tracker.step(); |
| 109 | +         boolean chunkComplete = tracker.chunkIf(10000, 10); |
| 110 | +         if ( chunkComplete ) out.println(tracker); |
| 111 | +     } |
| 112 | + } |
| 116 | + |
| | + /** |
| | +  * Runs {@code sql} through {@code db.executeQuery} and registers the first column |
| | +  * of every result row as a root via {@code putRoot}. |
| | +  * |
| | +  * @param db database access used to execute the query |
| | +  * @param sql query whose first column is read with {@code ResultSet.getInt(1)} |
| | +  * @throws SQLException if executing the query, reading a row or closing the result set fails |
| | +  */ |
| 117 | + public void loadRoots(DatabaseAccess db, String sql) throws SQLException { |
| 118 | +     ResultSet rs = db.executeQuery("load graph", sql); |
| 119 | +     try { |
| 120 | +         while (rs.next()) { |
| 121 | +             int root = rs.getInt(1); |
| 122 | + |
| 123 | +             putRoot(root); |
| 124 | +         } |
| | +     } finally { |
| | +         /* release the result set even when putRoot or the driver throws */ |
| | +         rs.close(); |
| | +     } |
| | + } |
50 | 125 | |
| | + /** |
| | +  * Finds the root reachable from the page with the given id. |
| | +  * |
| | +  * @param pageId id looked up with {@code getNodeByPageId} |
| | +  * @return result of {@code findRootNode(Node)} for the node found |
| | +  * @throws IllegalArgumentException if no node is found for {@code pageId} |
| | +  */ |
| 126 | + public Node findRootNode(int pageId) { |
| 127 | +     Node start = getNodeByPageId(pageId); |
| 128 | +     if ( start != null ) return findRootNode(start); |
| 129 | + |
| 130 | +     throw new IllegalArgumentException("page_id "+pageId+" not found"); |
| 131 | + } |
| 132 | + |
| 133 | + public Node findRootNode(Node n) { |
| 134 | + Iterable<Relationship> relationships = n.getRelationships(CategoryRelationships.CONTAINS, Direction.INCOMING); |
| 135 | + Iterator<Relationship> it = relationships.iterator(); |
| 136 | + |
| 137 | + if (it.hasNext()) return findRootNode(it.next().getStartNode()); |
| 138 | + else return n; |
| 139 | + } |
| 140 | + |
| | + /** |
| | +  * Looks up a node through the index by asking {@code indexer.getSingleNode} |
| | +  * for the key "page_id" and the given value. |
| | +  * |
| | +  * @param pageId value to look up under the "page_id" key |
| | +  * @return whatever the index returns; callers in this class treat {@code null} as "not found" |
| | +  */ |
51 | 141 | public Node getNodeByPageId(int pageId) { |
52 | 142 | return indexer.getSingleNode("page_id", pageId); |
53 | 143 | } |
— | — | @@ -64,46 +154,128 @@ |
65 | 155 | } |
66 | 156 | |
| | + /** |
| | +  * Adds an arc for two page ids by resolving both with {@code aquireNodeByPageId} |
| | +  * and delegating to {@code putArc(Node, Node)}. |
| | +  * NOTE(review): aquireNodeByPageId is not visible here; presumably it finds or creates the node - confirm. |
| | +  * |
| | +  * @param from page id passed on as the {@code from} node |
| | +  * @param cat page id passed on as the {@code cat} node |
| | +  * @return the relationship from the delegate, or {@code null} when both ids are equal (no self-arc is stored) |
| | +  */ |
67 | 157 | public Relationship putArc(int from, int cat) { |
| 158 | + if ( from == cat ) return null; |
68 | 159 | return putArc( aquireNodeByPageId(from), aquireNodeByPageId(cat) ); |
69 | 160 | } |
70 | 161 | |
| 162 | + public Relationship putRoot(int root) { |
| 163 | + return putRoot( aquireNodeByPageId(root) ); |
| 164 | + } |
| 165 | + |
| | + /** |
| | +  * Creates a CONTAINS relationship that starts at {@code cat} and ends at {@code from}. |
| | +  * |
| | +  * @param from node the relationship points to |
| | +  * @param cat node the relationship starts at |
| | +  * @return the new relationship, or {@code null} when both nodes have the same id (no self-arc is created) |
| | +  */ |
71 | 166 | public Relationship putArc(Node from, Node cat) { |
| 167 | + if ( from.getId() == cat.getId() ) return null; |
72 | 168 | Relationship relationship = cat.createRelationshipTo( from, CategoryRelationships.CONTAINS ); |
73 | 169 | return relationship; |
74 | 170 | } |
75 | 171 | |
| 172 | + public Relationship putRoot(Node root) { |
| 173 | + Node ref = graphDb.getReferenceNode(); |
| 174 | + if (ref.getId() == root.getId()) return null; |
| 175 | + |
| 176 | + Relationship relationship = ref.createRelationshipTo( root, CategoryRelationships.CONTAINS ); |
| 177 | + return relationship; |
| 178 | + } |
| 179 | + |
| | + /** |
| | +  * Collects the page ids reachable from the page with the given id. |
| | +  * |
| | +  * @param start id looked up with {@code getNodeByPageId} |
| | +  * @return result of {@code getDescendants(Node)} for the node found |
| | +  * @throws IllegalArgumentException if no node is found for {@code start} |
| | +  */ |
| 180 | + public Collection<Integer> getDescendants(int start) { |
| 181 | +     Node startNode = getNodeByPageId(start); |
| 182 | +     if ( startNode != null ) return getDescendants(startNode); |
| 183 | + |
| 184 | +     throw new IllegalArgumentException("page_id "+start+" not found"); |
| 185 | + } |
| 186 | + |
| | + /** |
| | +  * Traverses outgoing CONTAINS relationships breadth-first from {@code startNode} to the |
| | +  * end of the graph and collects the "page_id" property of every node returned. |
| | +  * Nodes without a "page_id" property are skipped. |
| | +  * NOTE(review): with ReturnableEvaluator.ALL the start node itself is presumably part of the result - confirm. |
| | +  * |
| | +  * @param startNode node the traversal starts at |
| | +  * @return page ids in traversal order |
| | +  */ |
| 187 | + public Collection<Integer> getDescendants(Node startNode) { |
| 188 | +     Traverser reachable = startNode.traverse( Traverser.Order.BREADTH_FIRST , StopEvaluator.END_OF_GRAPH, ReturnableEvaluator.ALL, CategoryRelationships.CONTAINS, Direction.OUTGOING ); |
| 189 | + |
| 190 | +     List<Integer> pageIds = new ArrayList<Integer>(); |
| 191 | +     for ( Node visited : reachable ) { |
| 192 | +         if ( !visited.hasProperty("page_id") ) continue; |
| 193 | +         pageIds.add((Integer)visited.getProperty("page_id")); |
| 194 | +     } |
| 195 | + |
| 196 | +     return pageIds; |
| 197 | + } |
| 199 | + |
| | + /** |
| | +  * Traverses outgoing CONTAINS relationships breadth-first from {@code startNode} to the |
| | +  * end of the graph and prints one line per returned node to {@code System.out}: |
| | +  * "page #" plus the "page_id" property when the node has one, otherwise the node's {@code toString()}. |
| | +  * |
| | +  * @param startNode node the traversal starts at |
| | +  */ |
76 | 200 | public void traverseAndDump(Node startNode) { |
77 | 201 | Traverser traverser = startNode.traverse( Traverser.Order.BREADTH_FIRST , StopEvaluator.END_OF_GRAPH, ReturnableEvaluator.ALL, CategoryRelationships.CONTAINS, Direction.OUTGOING ); |
78 | 202 | for ( Node node : traverser ) |
79 | 203 | { |
80 | | - System.out.println( node ); |
| 204 | + if ( node.hasProperty("page_id") ) |
| 205 | + System.out.println( "page #" + node.getProperty("page_id") ); |
| 206 | + else |
| 207 | + System.out.println( node.toString() ); |
81 | 208 | } |
82 | 209 | } |
83 | 210 | |
84 | | - public static void main(String[] args) throws IOException, SQLException { |
85 | | - GraphDatabaseService graphDb = new EmbeddedGraphDatabase( args[0] ); |
86 | | - DatabaseConnectionInfo dbInfo = new DatabaseConnectionInfo( new File(args[1]) ); |
87 | | - String sql = IOUtil.slurp(new File(args[2]), "UTF-8"); |
| 211 | + /* |
| 212 | + protected static final String catlinksSql = "select cl_from, page_id from categorylinks join page on cl_to = page_title and page_namespace = 14"; |
| 213 | + protected static final String rootCatsSql = "select page_id from page left join categorylinks on cl_from = page_id where cl_from is null and page_namespace = 14"; |
| 214 | + */ |
| 215 | + |
| | + /** |
| | +  * Looks up the descendants of {@code start}, reports how long the lookup took and |
| | +  * dumps the resulting ids between two separator lines. |
| | +  * |
| | +  * @param start page id handed to {@code getDescendants(int)} |
| | +  * @param out destination of all messages and of the dump (this parameter, not the field of the same name) |
| | +  * @throws IllegalArgumentException from {@code getDescendants(int)} if the page id is not found |
| | +  */ |
| 216 | + public void listDescendants(int start, Output out) { |
| 217 | +     final String separator = "-----------------------------"; |
| 218 | + |
| 219 | +     out.println("finding descendants of "+start+"...."); |
| 220 | +     long startedAt = System.currentTimeMillis(); |
| 221 | +     Collection<Integer> pageIds = getDescendants(start); |
| 222 | +     long elapsed = System.currentTimeMillis() - startedAt; |
| 223 | +     out.println("finding descendants of "+start+" took "+elapsed+"ms."); |
| 224 | + |
| 225 | +     out.println(separator); |
| | +     DebugUtil.dump("", pageIds, out); |
| | +     out.println(separator); |
| | + } |
| 226 | + |
| | + /** |
| | +  * Creates the command object for a console command. Commands known to the superclass |
| | +  * win; otherwise "descendants", "desc" and "d" produce a {@code Descendants} command |
| | +  * whose start page id is the first argument, converted with {@code DatabaseUtil.asInt}. |
| | +  * |
| | +  * @param cmd command name as entered |
| | +  * @param args command arguments; the descendants commands require at least one |
| | +  * @return the command, or {@code null} if {@code cmd} is not recognised |
| | +  * @throws IllegalArgumentException if a descendants command is given without a page id |
| | +  */ |
| 227 | + protected Command newCommand(String cmd, List<Object> args) { |
| 228 | +     Command command = super.newCommand(cmd, args); |
| 229 | +     if (command!=null) return command; |
88 | 230 | |
| 231 | +     if (cmd.equals("descendants") || cmd.equals("desc") || cmd.equals("d")) { |
| | +         /* fail with a readable message instead of an IndexOutOfBoundsException from args.get(0) */ |
| | +         if (args == null || args.isEmpty()) throw new IllegalArgumentException("usage: "+cmd+" <page_id>"); |
| | +         return new Descendants(DatabaseUtil.asInt(args.get(0))); |
| | +     } |
| 232 | +     return null; |
| 233 | + } |
| 234 | + |
| 235 | + public static void main(String[] argv) throws IOException, SQLException, PersistenceException { |
| 236 | + Arguments args = new Arguments(); |
| 237 | + args.parse(argv); |
| 238 | + |
| 239 | + Map<String,String> configuration = null; |
| 240 | + |
| 241 | + if ( args.isSet("config") ) { |
| 242 | + configuration = EmbeddedGraphDatabase.loadConfigurations( args.getOption("config", (String)null) ); |
| 243 | + } else { |
| 244 | + URL u = CatGraph.class.getResource("neo4j.properties"); |
| 245 | + configuration = CollectionUtils.asMap( SystemUtils.loadProperties(u, null) ); |
| 246 | + } |
| 247 | + |
| 248 | + GraphDatabaseService graphDb = new EmbeddedGraphDatabase( args.getParameter(0), configuration ); |
| 249 | + File tsv = new File(args.getParameter(1)); |
| 250 | + |
89 | 251 | IndexService indexer = new LuceneIndexService(graphDb); |
| 252 | + |
| 253 | + /* |
| 254 | + DatabaseAccess db = new DatabaseSchema(null, dbInfo, null); |
| 255 | + db.open(); |
90 | 256 | |
91 | | - DatabaseAccess db = new DatabaseSchema(null, dbInfo, null); |
92 | | - |
| 257 | + db.executeUpdate("", "use "+database+";"); |
| 258 | + */ |
93 | 259 | CatGraph graph = new CatGraph(graphDb, indexer); |
94 | 260 | |
| 261 | + InputStreamReader rd = new InputStreamReader(new FileInputStream(tsv)); |
| 262 | + ChunkingCursor cursor = new ChunkingCursor(new LineCursor(rd), CsvLineChunker.tsv); |
| 263 | + |
| 264 | + cursor.next(); //skip header in first line |
| 265 | + |
95 | 266 | Transaction tx = graphDb.beginTx(); |
96 | 267 | try |
97 | 268 | { |
98 | | - |
99 | | - graph.load(db, sql); |
100 | | - |
101 | | - graph.traverseAndDump(graphDb.getReferenceNode()); |
102 | | - |
103 | | - tx.success(); |
| 269 | + System.out.println("loading arcs...."); |
| 270 | + long t = System.currentTimeMillis(); |
| 271 | + graph.loadArcs(cursor, 0, 1); |
| 272 | + System.out.println("loading arcs took "+(System.currentTimeMillis() - t)+"ms."); |
| 273 | + |
| 274 | + graph.run(); |
104 | 275 | } |
105 | 276 | finally |
106 | 277 | { |
107 | 278 | tx.finish(); |
| 279 | + graphDb.shutdown(); |
108 | 280 | } |
109 | 281 | |
110 | 282 | System.out.println( "done" ); |
Index: trunk/WikiWord/CatGraph/pom.xml |
— | — | @@ -51,11 +51,6 @@ |
52 | 52 | </dependency>
|
53 | 53 | <dependency>
|
54 | 54 | <groupId>org.neo4j</groupId>
|
55 | | - <artifactId>neo4j-lucene-index</artifactId>
|
56 | | - <version>1.0</version>
|
57 | | - </dependency>
|
58 | | - <dependency>
|
59 | | - <groupId>org.neo4j</groupId>
|
60 | 55 | <artifactId>neo4j-shell</artifactId>
|
61 | 56 | <version>1.0</version>
|
62 | 57 | </dependency>
|
— | — | @@ -64,21 +59,6 @@ |
65 | 60 | <artifactId>neo4j-utils</artifactId>
|
66 | 61 | <version>1.0</version>
|
67 | 62 | </dependency>
|
68 | | - <dependency>
|
69 | | - <groupId>org.neo4j</groupId>
|
70 | | - <artifactId>neo4j-rest</artifactId>
|
71 | | - <version>1.0</version>
|
72 | | - </dependency>
|
73 | | - <dependency>
|
74 | | - <groupId>org.neo4j</groupId>
|
75 | | - <artifactId>neo4j-traversal</artifactId>
|
76 | | - <version>1.0</version>
|
77 | | - </dependency>
|
78 | | - <dependency>
|
79 | | - <groupId>org.neo4j</groupId>
|
80 | | - <artifactId>neo4j-graph-algo</artifactId>
|
81 | | - <version>1.0</version>
|
82 | | - </dependency>
|
83 | 63 | </dependencies>
|
84 | 64 |
|
85 | 65 | <build>
|