r69947 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r69946 | r69947 | r69948 >
Date: 14:54, 26 July 2010
Author: daniel
Status: deferred
Tags:
Comment: config, progress tracking
Modified paths:
  • /trunk/WikiWord/CatGraph/.classpath (modified) (history)
  • /trunk/WikiWord/CatGraph/pom.xml (modified) (history)
  • /trunk/WikiWord/CatGraph/src/main/java/de/wikimedia/catgraph/CatGraph.java (modified) (history)

Diff [purge]

Index: trunk/WikiWord/CatGraph/.classpath
@@ -12,5 +12,8 @@
1313 </classpathentry>
1414 <classpathentry kind="var" path="M2_REPO/org/apache/geronimo/specs/geronimo-jta_1.1_spec/1.1.1/geronimo-jta_1.1_spec-1.1.1.jar"/>
1515 <classpathentry kind="var" path="M2_REPO/org/neo4j/neo4j-index/1.0/neo4j-index-1.0.jar"/>
 16+ <classpathentry kind="var" path="M2_REPO/org/apache/lucene/lucene-core/2.9.1/lucene-core-2.9.1.jar"/>
 17+ <classpathentry kind="var" path="M2_REPO/org/neo4j/neo4j-commons/1.0/neo4j-commons-1.0.jar"/>
 18+ <classpathentry kind="var" path="M2_REPO/mysql/mysql-connector-java/3.1.11/mysql-connector-java-3.1.11.jar"/>
1619 <classpathentry kind="output" path="bin"/>
1720 </classpath>
Index: trunk/WikiWord/CatGraph/src/main/java/de/wikimedia/catgraph/CatGraph.java
@@ -1,9 +1,17 @@
22 package de.wikimedia.catgraph;
33
44 import java.io.File;
 5+import java.io.FileInputStream;
56 import java.io.IOException;
 7+import java.io.InputStreamReader;
 8+import java.net.URL;
69 import java.sql.ResultSet;
710 import java.sql.SQLException;
 11+import java.util.ArrayList;
 12+import java.util.Collection;
 13+import java.util.Iterator;
 14+import java.util.List;
 15+import java.util.Map;
816
917 import org.neo4j.graphdb.Direction;
1018 import org.neo4j.graphdb.GraphDatabaseService;
@@ -18,12 +26,36 @@
1927 import org.neo4j.index.lucene.LuceneIndexService;
2028 import org.neo4j.kernel.EmbeddedGraphDatabase;
2129
 30+import de.brightbyte.application.Arguments;
 31+import de.brightbyte.application.ConsoleApp;
 32+import de.brightbyte.audit.DebugUtil;
 33+import de.brightbyte.data.Pair;
 34+import de.brightbyte.data.cursor.DataCursor;
2235 import de.brightbyte.db.DatabaseAccess;
23 -import de.brightbyte.db.DatabaseConnectionInfo;
24 -import de.brightbyte.db.DatabaseSchema;
25 -import de.brightbyte.io.IOUtil;
 36+import de.brightbyte.db.DatabaseUtil;
 37+import de.brightbyte.io.ChunkingCursor;
 38+import de.brightbyte.io.LineCursor;
 39+import de.brightbyte.io.Output;
 40+import de.brightbyte.job.ChunkedProgressRateTracker;
 41+import de.brightbyte.text.CsvLineChunker;
 42+import de.brightbyte.util.CollectionUtils;
 43+import de.brightbyte.util.PersistenceException;
 44+import de.brightbyte.util.SystemUtils;
2645
27 -public class CatGraph {
 46+public class CatGraph extends ConsoleApp {
 47+ protected class Descendants implements Command {
 48+
 49+ private int start;
 50+
 51+ public Descendants(int start) {
 52+ this.start = start;
 53+ }
 54+
 55+ public void execute(ConsoleApp app) throws Exception {
 56+ listDescendants(start, out);
 57+ }
 58+ }
 59+
2860 public enum CategoryRelationships implements RelationshipType
2961 {
3062 CONTAINS
@@ -37,16 +69,74 @@
3870 this.indexer = indexer;
3971 }
4072
41 - public void load(DatabaseAccess db, String sql) throws SQLException {
 73+ public void loadArcs(DatabaseAccess db, String sql, int fromCol, int toCol) throws SQLException {
4274 ResultSet rs = db.executeQuery("load graph", sql);
4375 while (rs.next()) {
44 - int from = rs.getInt(1);
45 - int to = rs.getInt(2);
 76+ int from = rs.getInt(fromCol);
 77+ int to = rs.getInt(toCol);
4678
4779 putArc(from ,to);
4880 }
4981 }
 82+
 83+ public void loadArcs(DataCursor<? extends List<?>> args, int fromCol, int toCol) throws PersistenceException {
 84+ ChunkedProgressRateTracker progressTracker = new ChunkedProgressRateTracker("arcs");
 85+
 86+ List<?> row ;
 87+ while ((row = args.next()) != null) {
 88+ int from = DatabaseUtil.asInt( row.get(fromCol) );
 89+ int to = DatabaseUtil.asInt( row.get(toCol) );
 90+
 91+ putArc(from ,to);
 92+
 93+ progressTracker.step();
 94+ if ( progressTracker.chunkIf(10000, 10) ) {
 95+ out.println(progressTracker);
 96+ }
 97+ }
 98+ }
 99+
 100+ public void loadArcs(DataCursor<Pair<Integer, Integer>> args) throws PersistenceException {
 101+ ChunkedProgressRateTracker progressTracker = new ChunkedProgressRateTracker("arcs");
 102+
 103+ Pair<Integer, Integer> row ;
 104+ while ((row = args.next()) != null) {
 105+ int from = row.getA();
 106+ int to = row.getB();
 107+
 108+ putArc(from ,to);
 109+
 110+ progressTracker.step();
 111+ if ( progressTracker.chunkIf(10000, 10) ) {
 112+ out.println(progressTracker);
 113+ }
 114+ }
 115+ }
 116+
 117+ public void loadRoots(DatabaseAccess db, String sql) throws SQLException {
 118+ ResultSet rs = db.executeQuery("load graph", sql);
 119+ while (rs.next()) {
 120+ int root = rs.getInt(1);
 121+
 122+ putRoot(root);
 123+ }
 124+ }
50125
 126+ public Node findRootNode(int pageId) {
 127+ Node n = getNodeByPageId(pageId);
 128+ if ( n == null ) throw new IllegalArgumentException("page_id "+pageId+" not found");
 129+
 130+ return findRootNode(n);
 131+ }
 132+
 133+ public Node findRootNode(Node n) {
 134+ Iterable<Relationship> relationships = n.getRelationships(CategoryRelationships.CONTAINS, Direction.INCOMING);
 135+ Iterator<Relationship> it = relationships.iterator();
 136+
 137+ if (it.hasNext()) return findRootNode(it.next().getStartNode());
 138+ else return n;
 139+ }
 140+
51141 public Node getNodeByPageId(int pageId) {
52142 return indexer.getSingleNode("page_id", pageId);
53143 }
@@ -64,46 +154,128 @@
65155 }
66156
67157 public Relationship putArc(int from, int cat) {
 158+ if ( from == cat ) return null;
68159 return putArc( aquireNodeByPageId(from), aquireNodeByPageId(cat) );
69160 }
70161
 162+ public Relationship putRoot(int root) {
 163+ return putRoot( aquireNodeByPageId(root) );
 164+ }
 165+
71166 public Relationship putArc(Node from, Node cat) {
 167+ if ( from.getId() == cat.getId() ) return null;
72168 Relationship relationship = cat.createRelationshipTo( from, CategoryRelationships.CONTAINS );
73169 return relationship;
74170 }
75171
 172+ public Relationship putRoot(Node root) {
 173+ Node ref = graphDb.getReferenceNode();
 174+ if (ref.getId() == root.getId()) return null;
 175+
 176+ Relationship relationship = ref.createRelationshipTo( root, CategoryRelationships.CONTAINS );
 177+ return relationship;
 178+ }
 179+
 180+ public Collection<Integer> getDescendants(int start) {
 181+ Node n = getNodeByPageId(start);
 182+ if ( n == null ) throw new IllegalArgumentException("page_id "+start+" not found");
 183+
 184+ return getDescendants(n);
 185+ }
 186+
 187+ public Collection<Integer> getDescendants(Node startNode) {
 188+ List<Integer> descendants = new ArrayList<Integer>();
 189+
 190+ Traverser traverser = startNode.traverse( Traverser.Order.BREADTH_FIRST , StopEvaluator.END_OF_GRAPH, ReturnableEvaluator.ALL, CategoryRelationships.CONTAINS, Direction.OUTGOING );
 191+ for ( Node node : traverser )
 192+ {
 193+ if ( node.hasProperty("page_id") )
 194+ descendants.add((Integer)node.getProperty("page_id"));
 195+ }
 196+
 197+ return descendants;
 198+ }
 199+
76200 public void traverseAndDump(Node startNode) {
77201 Traverser traverser = startNode.traverse( Traverser.Order.BREADTH_FIRST , StopEvaluator.END_OF_GRAPH, ReturnableEvaluator.ALL, CategoryRelationships.CONTAINS, Direction.OUTGOING );
78202 for ( Node node : traverser )
79203 {
80 - System.out.println( node );
 204+ if ( node.hasProperty("page_id") )
 205+ System.out.println( "page #" + node.getProperty("page_id") );
 206+ else
 207+ System.out.println( node.toString() );
81208 }
82209 }
83210
84 - public static void main(String[] args) throws IOException, SQLException {
85 - GraphDatabaseService graphDb = new EmbeddedGraphDatabase( args[0] );
86 - DatabaseConnectionInfo dbInfo = new DatabaseConnectionInfo( new File(args[1]) );
87 - String sql = IOUtil.slurp(new File(args[2]), "UTF-8");
 211+ /*
 212+ protected static final String catlinksSql = "select cl_from, page_id from categorylinks join page on cl_to = page_title and page_namespace = 14";
 213+ protected static final String rootCatsSql = "select page_id from page left join categorylinks on cl_from = page_id where cl_from is null and page_namespace = 14";
 214+ */
 215+
 216+ public void listDescendants(int start, Output out) {
 217+ out.println("finding descendants of "+start+"....");
 218+ long t = System.currentTimeMillis();
 219+ Collection<Integer> descendants = getDescendants(start);
 220+ out.println("finding descendants of "+start+" took "+(System.currentTimeMillis() - t)+"ms.");
 221+
 222+ out.println("-----------------------------");
 223+ DebugUtil.dump("", descendants, out);
 224+ out.println("-----------------------------");
 225+ }
 226+
 227+ protected Command newCommand(String cmd, List<Object> args) {
 228+ Command command = super.newCommand(cmd, args);
 229+ if (command!=null) return command;
88230
 231+ if (cmd.equals("descendants") || cmd.equals("desc") || cmd.equals("d")) return new Descendants(DatabaseUtil.asInt(args.get(0)));
 232+ else return null;
 233+ }
 234+
 235+ public static void main(String[] argv) throws IOException, SQLException, PersistenceException {
 236+ Arguments args = new Arguments();
 237+ args.parse(argv);
 238+
 239+ Map<String,String> configuration = null;
 240+
 241+ if ( args.isSet("config") ) {
 242+ configuration = EmbeddedGraphDatabase.loadConfigurations( args.getOption("config", (String)null) );
 243+ } else {
 244+ URL u = CatGraph.class.getResource("neo4j.properties");
 245+ configuration = CollectionUtils.asMap( SystemUtils.loadProperties(u, null) );
 246+ }
 247+
 248+ GraphDatabaseService graphDb = new EmbeddedGraphDatabase( args.getParameter(0), configuration );
 249+ File tsv = new File(args.getParameter(1));
 250+
89251 IndexService indexer = new LuceneIndexService(graphDb);
 252+
 253+ /*
 254+ DatabaseAccess db = new DatabaseSchema(null, dbInfo, null);
 255+ db.open();
90256
91 - DatabaseAccess db = new DatabaseSchema(null, dbInfo, null);
92 -
 257+ db.executeUpdate("", "use "+database+";");
 258+ */
93259 CatGraph graph = new CatGraph(graphDb, indexer);
94260
 261+ InputStreamReader rd = new InputStreamReader(new FileInputStream(tsv));
 262+ ChunkingCursor cursor = new ChunkingCursor(new LineCursor(rd), CsvLineChunker.tsv);
 263+
 264+ cursor.next(); //skip header in first line
 265+
95266 Transaction tx = graphDb.beginTx();
96267 try
97268 {
98 -
99 - graph.load(db, sql);
100 -
101 - graph.traverseAndDump(graphDb.getReferenceNode());
102 -
103 - tx.success();
 269+ System.out.println("loading arcs....");
 270+ long t = System.currentTimeMillis();
 271+ graph.loadArcs(cursor, 0, 1);
 272+ System.out.println("loading arcs took "+(System.currentTimeMillis() - t)+"ms.");
 273+
 274+ graph.run();
104275 }
105276 finally
106277 {
107278 tx.finish();
 279+ graphDb.shutdown();
108280 }
109281
110282 System.out.println( "done" );
Index: trunk/WikiWord/CatGraph/pom.xml
@@ -51,11 +51,6 @@
5252 </dependency>
5353 <dependency>
5454 <groupId>org.neo4j</groupId>
55 - <artifactId>neo4j-lucene-index</artifactId>
56 - <version>1.0</version>
57 - </dependency>
58 - <dependency>
59 - <groupId>org.neo4j</groupId>
6055 <artifactId>neo4j-shell</artifactId>
6156 <version>1.0</version>
6257 </dependency>
@@ -64,21 +59,6 @@
6560 <artifactId>neo4j-utils</artifactId>
6661 <version>1.0</version>
6762 </dependency>
68 - <dependency>
69 - <groupId>org.neo4j</groupId>
70 - <artifactId>neo4j-rest</artifactId>
71 - <version>1.0</version>
72 - </dependency>
73 - <dependency>
74 - <groupId>org.neo4j</groupId>
75 - <artifactId>neo4j-traversal</artifactId>
76 - <version>1.0</version>
77 - </dependency>
78 - <dependency>
79 - <groupId>org.neo4j</groupId>
80 - <artifactId>neo4j-graph-algo</artifactId>
81 - <version>1.0</version>
82 - </dependency>
8363 </dependencies>
8464
8565 <build>

Status & tagging log