diff --git a/doc/lab3design.txt b/doc/lab3design.txt new file mode 100644 index 0000000000000000000000000000000000000000..b547fc2e133dca808fd45913b1411a5db5fe3c61 --- /dev/null +++ b/doc/lab3design.txt @@ -0,0 +1,138 @@ +CS122 Assignment 3 - Table Statistics and Plan Costing - Design Document +======================================================================== + +Fill in answers for all questions based on your team's work on Assignment 3. + +A: Statistics Collection +------------------------- + +A1. Using pseudocode, summarize the implementation of your HeapTupleFile + analyze() function. Please summarize your actual code; do not simply + paste in the assignment description or the actual code you wrote. + +B: Plan Costing Estimates +---------------------------------------- + +B1. Briefly describe how you estimate the number of tuples and the cost + of a file-scan plan node. What factors does your cost include? + +B2. Briefly describe how you estimate the number of tuples and the cost + of a simple-filter plan node. What factors does your cost include? + +B3. Briefly describe how you estimate the number of tuples and the cost + of a nested-loop join plan node. What factors does your cost include? + +B4. For each kind of comparison (==, !=, >, <, >=, <=), how do you update the + estimated number of distinct values for each kind of comparison that your + StatisticsUpdater implementation supports? Are there cases where you make + no changes to the statistics? + +B5. For each kind of comparison (==, !=, >, <, >=, <=), how do you update the + estimated min and max values for each kind of comparison that your + StatisticsUpdater implementation supports? Are there cases where you make + no changes to the statistics? + +C: Costing SQL Queries +----------------------- + +Answer these questions after you have loaded the stores-28K.sql data, and +have analyzed all of the tables in that schema. + +C1. 
Paste the output of running: EXPLAIN SELECT * FROM cities; + Do not include debug lines, just the output of the command itself. + +C2. What is the estimated number of tuples that will be produced by each + of these queries: + + SELECT * FROM cities WHERE population > 1000000; + + <paste output here> + + SELECT * FROM cities WHERE population > 5000000; + + <paste output here> + + SELECT * FROM cities WHERE population > 8000000; + + <paste output here> + + How many tuples does each query produce? + + Briefly explain the difference between the estimated number of tuples + and the actual number of tuples for these queries. + +C3. Paste the output of running these commands: + + EXPLAIN SELECT store_id FROM stores, cities + WHERE stores.city_id = cities.city_id AND + cities.population > 1000000; + + <paste output here> + + EXPLAIN SELECT store_id FROM stores JOIN + (SELECT city_id FROM cities + WHERE population > 1000000) AS big_cities + ON stores.city_id = big_cities.city_id; + + <paste output here> + + The estimated number of tuples produced should be the same, but the + costs should be different. Explain why. + +C4. The assignment gives this example "slow" query: + + SELECT store_id, property_costs + FROM stores, cities, states + WHERE stores.city_id = cities.city_id AND + cities.state_id = states.state_id AND + state_name = 'Oregon' AND property_costs > 500000; + + How long does this query take to run, in seconds? + + Include the EXPLAIN output for the above query here. + + <paste output here> + + How would you rewrite this query (e.g. using ON clauses, subqueries + in the FROM clause, etc.) to be as optimal as possible? Also include + the result of EXPLAINing your query. + +D: Extra Credit [OPTIONAL] +--------------------------- + +If you implemented any extra-credit tasks for this assignment, describe +them here. The description should be like this, with stuff in "<>" replaced. +(The value i starts at 1 and increments...) 
+ +D<i>: <one-line description> + + <brief summary of what you did, including the specific classes that + we should look at for your implementation> + + <brief summary of test-cases that demonstrate/exercise your extra work> + +E: Feedback [OPTIONAL] +----------------------- + +WE NEED YOUR FEEDBACK! Thoughtful and constructive input will help us to +improve future versions of the course. These questions are OPTIONAL, and +your answers will not affect your grade in any way (including if you hate +everything about the assignment and databases in general, or Donnie and/or +the TAs in particular). Feel free to answer as many or as few of them as +you wish. + +E1. What parts of the assignment were most time-consuming? Why? + +E2. Did you find any parts of the assignment particularly instructive? + Correspondingly, did any parts feel like unnecessary busy-work? + +E3. Did you particularly enjoy any parts of the assignment? Were there + any parts that you particularly disliked? + +E4. Were there any critical details that you wish had been provided with the + assignment, that we should consider including in subsequent versions of + the assignment? + +E5. Do you have any other suggestions for how future versions of the + assignment can be improved? + diff --git a/doc/lab3info.txt b/doc/lab3info.txt new file mode 100644 index 0000000000000000000000000000000000000000..f70539975c35bc6f7679405a5d5b7a66e331e725 --- /dev/null +++ b/doc/lab3info.txt @@ -0,0 +1,33 @@ +CS122 Assignment 3 - Table Statistics and Plan Costing +====================================================== + +Please completely fill out this document so that we know who participated on +the assignment, any late extensions received, and how much time the assignment +took for your team. Thank you! + +L1. List your team name and the people who worked on this assignment. + + <team name> + + <name> + <name> + ... + +L2. Specify the tag and commit-hash of the Git commit you are submitting for + your assignment. 
(You can list the hashes of all tags with the command + "git show-ref --tags".) + + Tag: <tag> + Commit hash: <hash> + +L3. Specify how many late tokens you are applying to this assignment, if any. + Similarly, if your team received an extension from Donnie then please + indicate how many days extension you received. You may leave this blank + if it is not relevant to this submission. + + <tokens / extension> + +L4. For each teammate, briefly describe what parts of the assignment each + teammate focused on, along with the total hours spent on the assignment. + + diff --git a/src/main/java/edu/caltech/nanodb/expressions/ArithmeticOperator.java b/src/main/java/edu/caltech/nanodb/expressions/ArithmeticOperator.java index 6457b47ed847ff60ba21c802852230e4a3ca1b28..68e224a2131aa131e6e3599f76524364862aff2c 100755 --- a/src/main/java/edu/caltech/nanodb/expressions/ArithmeticOperator.java +++ b/src/main/java/edu/caltech/nanodb/expressions/ArithmeticOperator.java @@ -770,6 +770,7 @@ public class ArithmeticOperator extends Expression { * Simplifies an arithmetic expression, computing as much of the expression * as possible. */ + @Override public Expression simplify() { leftExpr = leftExpr.simplify(); rightExpr = rightExpr.simplify(); diff --git a/src/main/java/edu/caltech/nanodb/expressions/BooleanOperator.java b/src/main/java/edu/caltech/nanodb/expressions/BooleanOperator.java index 0b28fcd74dd9a603cc33a5fd2b223f9184d1e6a2..68edee97a017d4460185f34732b07d597ac56882 100755 --- a/src/main/java/edu/caltech/nanodb/expressions/BooleanOperator.java +++ b/src/main/java/edu/caltech/nanodb/expressions/BooleanOperator.java @@ -435,6 +435,70 @@ public class BooleanOperator extends Expression { } + /** + * Simplifies a Boolean expression by eliminating and de-nesting as much of + * the expression as possible. 
+ */ + @Override + public Expression simplify() { + // Go through and try to simplify anything we can inside the operator + for (int i = 0; i < terms.size(); i++) { + Expression e = terms.get(i); + terms.set(i, e.simplify()); + } + + // There is only one simplification for the NOT expression: + // NOT (NOT P) = P + if (type == Type.NOT_EXPR) { + assert terms.size() == 1; + if (terms.get(0) instanceof BooleanOperator) { + BooleanOperator nested = (BooleanOperator) terms.get(0); + if (nested.type == Type.NOT_EXPR) { + assert nested.terms.size() == 1; + return nested.terms.get(0); + } + } + + // If we fall through to here, we can't do any simplification of + // the NOT expression. + return this; + } + + // Handle AND and OR expression simplifications. + assert type == Type.AND_EXPR || type == Type.OR_EXPR; + + // If there is only one term, just return the term. + if (terms.size() == 1) + return terms.get(0); + + int i = 0; + while (i < terms.size()) { + Expression e = terms.get(i); + if (e instanceof BooleanOperator) { + BooleanOperator b = (BooleanOperator) e; + // If the nested Boolean operator is the same type as this + // one, we can lift up the terms and put them in this + // operator. + if (b.type == type) { + terms.remove(i); + terms.addAll(i, b.terms); + } + else { + i++; + } + + // The way this loop is constructed, we will repeat this + // check/lift process for any child nodes we just lifted up, + // that are also BooleanOperators of the same type, since + // the new nodes will now be at index i. + } + else { + // Terms that are not Boolean operators have nothing to lift, + // so step past them; otherwise the loop would never advance. + i++; + } + } + + + return this; + } + + + /** * Performs a value-eqality test for whether the specified object is an * expression with the same structure and contents.
diff --git a/src/main/java/edu/caltech/nanodb/expressions/PredicateUtils.java b/src/main/java/edu/caltech/nanodb/expressions/PredicateUtils.java index 45ce42fb57958279a35f89525628970a4ffa4fcd..96273d5b01918fdad973af955b73ba9c5ad30be4 100644 --- a/src/main/java/edu/caltech/nanodb/expressions/PredicateUtils.java +++ b/src/main/java/edu/caltech/nanodb/expressions/PredicateUtils.java @@ -51,7 +51,7 @@ public class PredicateUtils { public static Expression makePredicate(Expression... conjuncts) { - ArrayList<Expression> list = new ArrayList<Expression>(); + ArrayList<Expression> list = new ArrayList<>(); for (Expression conjunct : conjuncts) list.add(conjunct); diff --git a/src/main/java/edu/caltech/nanodb/plannodes/FileScanNode.java b/src/main/java/edu/caltech/nanodb/plannodes/FileScanNode.java index 969c7d358467592ceb6d51c17687baa04af1ba48..9be881629a932d325ca73d997c39db68918010bb 100644 --- a/src/main/java/edu/caltech/nanodb/plannodes/FileScanNode.java +++ b/src/main/java/edu/caltech/nanodb/plannodes/FileScanNode.java @@ -223,15 +223,13 @@ public class FileScanNode extends SelectNode { // Grab the schema and statistics from the table file. schema = tupleFile.getSchema(); - TableStats tableStats = tupleFile.getStats(); - stats = tableStats.getAllColumnStats(); // TODO: Compute the cost of the plan node! cost = null; // TODO: Update the statistics based on the predicate. 
- + stats = tableStats.getAllColumnStats(); } diff --git a/src/main/java/edu/caltech/nanodb/queryast/SelectClause.java b/src/main/java/edu/caltech/nanodb/queryast/SelectClause.java index 20b774cd0ab1dc3be610e0158f2e8755bf98cb31..18de7086ec00eefa40d8a168e1181b09d2772b8a 100755 --- a/src/main/java/edu/caltech/nanodb/queryast/SelectClause.java +++ b/src/main/java/edu/caltech/nanodb/queryast/SelectClause.java @@ -14,6 +14,7 @@ import org.apache.logging.log4j.LogManager; import edu.caltech.nanodb.expressions.ColumnName; import edu.caltech.nanodb.expressions.Expression; import edu.caltech.nanodb.expressions.OrderByExpression; +import edu.caltech.nanodb.queryeval.InvalidSQLException; import edu.caltech.nanodb.relations.ColumnInfo; import edu.caltech.nanodb.relations.Schema; import edu.caltech.nanodb.relations.SchemaNameException; @@ -285,19 +286,38 @@ public class SelectClause { } + /** + * If a <tt>LIMIT</tt> clause is specified, this method returns the + * specified limit; otherwise, the default of 0 is returned. + * + * @return the limit specified in the SQL + */ public int getLimit() { return limit; } + /** + * Set the upper limit of how many rows should be produced by this query. + * A value of 0 means "unlimited." Negative values are disallowed and + * will cause an exception to be thrown. + * + * @param limit a positive number specifying the maximum number of tuples + * to produce, or 0 to specify "unlimited." + */ public void setLimit(int limit) { + if (limit < 0) { + throw new InvalidSQLException("Limit must be at least 0 (got " + + limit + ")"); + } + this.limit = limit; } /** * If an <tt>OFFSET</tt> clause is specified, this method returns the - * specified offset; otherwise, 0 is returned. + * specified offset; otherwise, the default of 0 is returned. * * @return the offset specified in the SQL */ @@ -306,7 +326,22 @@ public class SelectClause { } + /** + * Set the starting offset for the rows that should be produced by this + * query.
A value of 0 means "start with the first row" (in other words, + * "no offset"). Negative values are disallowed and will cause an + * exception to be thrown. + * + * @param offset a positive number specifying the number of tuples to skip + * during query evaluation, or 0 to specify "start at the + * beginning." + */ public void setOffset(int offset) { + if (offset < 0) { + throw new InvalidSQLException("Offset must be at least 0 (got " + + offset + ")"); + } + this.offset = offset; } @@ -378,8 +413,9 @@ public class SelectClause { ColumnName colName = selVal.getWildcard(); if (colName.isTableSpecified()) { if (!fromTables.contains(colName.getTableName())) { - throw new SchemaNameException("SELECT-value " + colName + - " specifies an unrecognized table name."); + throw new SchemaNameException(String.format( + "SELECT-value %s specifies an unrecognized " + + "table name.", colName)); } } } @@ -404,7 +440,8 @@ public class SelectClause { resultColumnInfos.addAll(selVal.getColumnInfos(fromSchema, resultSchema)); } - // Construct a resultSchema which is the "summation" of all SelectValues' columnInfos. + // Construct a resultSchema which is the "summation" of all + // SelectValues' columnInfos. resultSchema = new Schema(resultColumnInfos); logger.debug("Query schema: " + resultSchema); @@ -436,11 +473,12 @@ public class SelectClause { havingExpr.traverse(subquerySchemaComputer); } - // ORDER BY clauses: + // ORDER BY clauses: These are computed from the result of the + // SELECT clause, not the result of the FROM clause. for (OrderByExpression expr : orderByExprs) { // ORDER BY expressions aren't allowed to have subqueries. resolveExpressionRefs("ORDER BY clause", expr.getExpression(), - fromSchema, /* checkParentQueries */ false); + resultSchema, /* checkParentQueries */ false); } // All done! Return the computed schema. 
diff --git a/src/main/java/edu/caltech/nanodb/queryast/SelectValue.java b/src/main/java/edu/caltech/nanodb/queryast/SelectValue.java index 820241d6d6720310f13e292ef6661c33e44f85a9..89a3555cf39122f01d18947f28edb4d6cfb925d5 100755 --- a/src/main/java/edu/caltech/nanodb/queryast/SelectValue.java +++ b/src/main/java/edu/caltech/nanodb/queryast/SelectValue.java @@ -187,6 +187,17 @@ public class SelectValue implements Cloneable { } + /** + * Sets this SelectValue's alias result name to the specified string. + * + * @param alias the alias to use for the column, or {@code null} if no + * alias should be used + */ + public void setAlias(String alias) { + resultAlias = alias; + } + + /** * Returns the wildcard {@link ColumnName} object or <tt>null</tt> if there * is none. diff --git a/src/main/java/edu/caltech/nanodb/queryeval/ColumnStats.java b/src/main/java/edu/caltech/nanodb/queryeval/ColumnStats.java index 0790d344f700b501f1ba611128e68a3cec45059a..93267254129198bbb8a4bbdd9467281e2069a806 100644 --- a/src/main/java/edu/caltech/nanodb/queryeval/ColumnStats.java +++ b/src/main/java/edu/caltech/nanodb/queryeval/ColumnStats.java @@ -43,15 +43,6 @@ public class ColumnStats { private Object maxValue; - /** Initializes a column-stats object to all "unknown" values. */ - public ColumnStats() { - numUniqueValues = -1; - numNullValues = -1; - minValue = null; - maxValue = null; - } - - /** * Initializes a column-stats object with the specified values. * @@ -73,6 +64,19 @@ public class ColumnStats { } + /** Initializes a column-stats object to all "unknown" values. */ + public ColumnStats() { + this(-1, -1, null, null); + } + + + /** Copies another column-stats object into this object.
*/ + public ColumnStats(ColumnStats stats) { + this(stats.numUniqueValues, stats.numNullValues, + stats.minValue, stats.maxValue); + } + + /** * Returns the number of unique values for the column, or -1 if the number * is unknown diff --git a/src/main/java/edu/caltech/nanodb/queryeval/SelectivityEstimator.java b/src/main/java/edu/caltech/nanodb/queryeval/SelectivityEstimator.java index 28035d3590298fe72c5c68898749827b742efbfa..28e6ed270f9fa896d18c79211d85354a55d5a922 100644 --- a/src/main/java/edu/caltech/nanodb/queryeval/SelectivityEstimator.java +++ b/src/main/java/edu/caltech/nanodb/queryeval/SelectivityEstimator.java @@ -389,8 +389,8 @@ public class SelectivityEstimator { * (<em>high</em><sub>2</sub> - <em>low</em><sub>2</sub>), clamped * to the range [0, 1]. */ - private static float computeRatio(Object low1, Object high1, - Object low2, Object high2) { + public static float computeRatio(Object low1, Object high1, + Object low2, Object high2) { Object diff1 = ArithmeticOperator.evalObjects( ArithmeticOperator.Type.SUBTRACT, high1, low1); @@ -398,9 +398,13 @@ public class SelectivityEstimator { Object diff2 = ArithmeticOperator.evalObjects( ArithmeticOperator.Type.SUBTRACT, high2, low2); + diff1 = TypeConverter.getFloatValue(diff1); + diff2 = TypeConverter.getFloatValue(diff2); + Object ratio = ArithmeticOperator.evalObjects( ArithmeticOperator.Type.DIVIDE, diff1, diff2); + // This should already be a float, but just in case... 
float fltRatio = TypeConverter.getFloatValue(ratio); logger.debug(String.format("Ratio: (%s - %s) / (%s - %s) = %.2f", diff --git a/src/main/java/edu/caltech/nanodb/queryeval/StatisticsUpdater.java b/src/main/java/edu/caltech/nanodb/queryeval/StatisticsUpdater.java new file mode 100644 index 0000000000000000000000000000000000000000..2e55bf16752118224ccecd386f28b76a49abeeed --- /dev/null +++ b/src/main/java/edu/caltech/nanodb/queryeval/StatisticsUpdater.java @@ -0,0 +1,169 @@ +package edu.caltech.nanodb.queryeval; + + +import java.util.ArrayList; +import java.util.List; + +import edu.caltech.nanodb.expressions.BooleanOperator; +import edu.caltech.nanodb.expressions.ColumnName; +import edu.caltech.nanodb.expressions.ColumnValue; +import edu.caltech.nanodb.expressions.CompareOperator; +import edu.caltech.nanodb.expressions.Expression; +import edu.caltech.nanodb.expressions.LiteralValue; +import edu.caltech.nanodb.expressions.TypeConverter; +import edu.caltech.nanodb.relations.ColumnInfo; +import edu.caltech.nanodb.relations.Schema; + + +/** + * This helper class provides basic functionality for updating column + * statistics based on a selection predicate. The supported predicates are + * very limited, because the problem becomes very grungy for arbitrary + * predicates. Supported predicates include: + * <ul> + * <li>A single comparison between a column and a value</li> + * <li> + * A Boolean <tt>AND</tt> expression, although only the column-value + * comparisons within the expression will be used to update statistics + * </li> + * </ul> + */ +public class StatisticsUpdater { + /** This class should not be instantiated. */ + private StatisticsUpdater() { + throw new IllegalArgumentException( + "This class should not be instantiated."); + } + + + /** + * <p> + * This static helper takes a selection predicate, a schema the predicate + * is evaluated against, and input column-statistics, and produces output + * column-statistics that reflect the input arguments. 
The output is a + * deep-copy of the input, so that no accidental side-effects occur. + * </p> + * <p> + * Only a very limited number of selection predicates are supported, all + * centering around conjunctive selection based on "COLUMN op VALUE" + * components. Other kinds of predicates will not be used to update the + * statistics. + * </p> + * <p> + * If a predicate includes a column-reference that is not part of the + * input schema, then that part of the predicate will be ignored. + * </p> + * + * @param expr the selection predicate + * @param schema the schema that the selection predicate is evaluated + * against + * @param inputStats the column statistics of the input tuple-sequence + * that will be filtered by the selection predicate + * + * @return estimated column statistics + */ + public static ArrayList<ColumnStats> updateStats(Expression expr, + Schema schema, List<ColumnStats> inputStats) { + // Make a deep copy of the incoming list so we can mutate it safely. + ArrayList<ColumnStats> outputStats = new ArrayList<>(); + for (ColumnStats stat : inputStats) + outputStats.add(new ColumnStats(stat)); + + if (expr instanceof BooleanOperator) { + // The predicate includes a Boolean operation. If it's an AND + // operation (conjunctive selection) then look at each conjunct + // in sequence. + BooleanOperator boolOp = (BooleanOperator) expr; + if (boolOp.getType() == BooleanOperator.Type.AND_EXPR) { + for (int i = 0; i < boolOp.getNumTerms(); i++) { + Expression e = boolOp.getTerm(i); + if (e instanceof CompareOperator) { + // This conjunct appears to be a comparison. Unpack + // it and try to update the statistics based on the + // comparison. + updateCompareStats((CompareOperator) e, schema, + outputStats); + } + } + } + } + else if (expr instanceof CompareOperator) { + // The predicate appears to be a comparison. Unpack it and try to + // update the statistics based on the comparison. 
+ updateCompareStats((CompareOperator) expr, schema, outputStats); + } + return outputStats; + } + + + /** + * Try to update the column-statistics based on the passed-in comparison + * operation. Updates will only occur if the comparison is of the form + * "COLUMN = VALUE". + * + * @param comp The comparison operation to consider + * @param schema The schema that the operation is evaluated against + * @param stats the statistics to update based on the comparison + */ + private static void updateCompareStats(CompareOperator comp, + Schema schema, + List<ColumnStats> stats) { + // Move the comparison into a normalized order so that it's easier to + // write the logic for analysis. Specifically, this will ensure that + // if we are comparing a column and a value, the column will always be + // on the left and the value will always be on the right. + comp.normalize(); + + Expression left = comp.getLeftExpression(); + Expression right = comp.getRightExpression(); + + if (left instanceof ColumnValue && right instanceof LiteralValue) { + // Resolve the column name against the schema, so we can look up + // the corresponding statistics. If the column name is unknown, + // just return. + ColumnName colName = ((ColumnValue) left).getColumnName(); + int colIdx = schema.getColumnIndex(colName); + if (colIdx == -1) + return; + + // Get the column's type from the schema, so we can coerce the + // value in the comparison to the same type. + ColumnInfo colInfo = schema.getColumnInfo(colIdx); + Object value = right.evaluate(); + value = TypeConverter.coerceTo(value, colInfo.getType()); + + ColumnStats stat = stats.get(colIdx); + + /* TODO: IMPLEMENT THE REST! + * + * NOTE: In Java, you can switch on an enumerated type, but you + * do not specify the fully qualified name. Thus, you will end up + * with something like this: + * + * switch (comp.getType()) { + * case EQUALS: + * ... + * break; + * + * case NOT_EQUALS: + * ... + * break; + * + * ... // etc. 
+ * } + * + * If you need to declare local variables within a switch-block, + * you can always declare a nested block, like this: + * + * case SOMECASE: { + * int i = ...; + * ... // etc. + * break; + * } + * + * You may find the SelectivityEstimator.computeRatio() function + * to be useful for some of the operations in this code. + */ + } + } +} diff --git a/src/main/java/edu/caltech/nanodb/relations/Schema.java b/src/main/java/edu/caltech/nanodb/relations/Schema.java index 696da63acb189317b69c821776012499ad95ef12..38d87272a2fdc2e0113ca08347461290457c5bca 100755 --- a/src/main/java/edu/caltech/nanodb/relations/Schema.java +++ b/src/main/java/edu/caltech/nanodb/relations/Schema.java @@ -158,9 +158,14 @@ public class Schema implements Serializable, Iterable<ColumnInfo> { /** + * <p> * Create a schema that is the concatenation of one or more other schemas. * Schemas are copied in the order they are given. If a column name * appears multiple times in the input, an exception will be generated. + * </p> + * <p> + * Keys will not be copied by this constructor. + * </p> * * @param schemas one or more schema objects to copy into this schema */ @@ -195,11 +200,24 @@ public class Schema implements Serializable, Iterable<ColumnInfo> { } + /** + * Returns an unmodifiable list of all the columns in the schema + * + * @return an unmodifiable list of all the columns in the schema + */ public List<ColumnInfo> getColumnInfos() { return Collections.unmodifiableList(columnInfos); } + /** + * Constructs and returns a list of {@link ColumnInfo} objects for the + * columns at the specified indexes. This method can be useful with + * {@link ColumnRefs} objects,to retrieve the columns referenced by a key. 
+ * + * @param colIndexes an array of zero-based column indexes to retrieve + * @return a list of {@code ColumnInfo} objects for the specified columns + */ public ArrayList<ColumnInfo> getColumnInfos(int[] colIndexes) { ArrayList<ColumnInfo> result = new ArrayList<>(colIndexes.length); @@ -210,11 +228,26 @@ public class Schema implements Serializable, Iterable<ColumnInfo> { } + /** + * Provides support for iteration over the columns in the schema. + * + * @return an iterator over the columns in this schema. + */ + @Override public Iterator<ColumnInfo> iterator() { return Collections.unmodifiableList(columnInfos).iterator(); } + /** + * Add a column to the schema. + * + * @param colInfo the name and type of the column being added to the + * schema + * @return the zero-based index of the column in the schema + * + * @throws NullPointerException if {@code colInfo} is {@code null} + */ public int addColumnInfo(ColumnInfo colInfo) { if (colInfo == null) throw new NullPointerException("colInfo cannot be null"); @@ -384,26 +417,26 @@ public class Schema implements Serializable, Iterable<ColumnInfo> { /** - * This method iterates through all columns in this schema and sets them all - * to be on the specified table. This method will throw an exception if the - * result would be an invalid schema with duplicate column names. + * This method iterates through all columns in the schema, setting them + * to all have the specified table name. An exception will be thrown if + * the result would be an invalid schema with duplicate column names; in + * this case, the schema object will remain unchanged. * - * @throws SchemaNameException if the schema contains columns with the same - * column name but different table names. In this case, resetting the - * table name will produce an invalid schema with ambiguous column - * names. + * @throws SchemaNameException if the schema contains columns with the + * same column name but different table names.
In this case, + * resetting the table name will produce an invalid schema with + * ambiguous column names. * * @design (donnie) At present, this method does this by replacing each - * {@link edu.caltech.nanodb.relations.ColumnInfo} object with a new - * object with updated information. This is because - * <code>ColumnInfo</code> is currently immutable. + * {@link ColumnInfo} object with a new object with updated + * information. This is because {@code ColumnInfo} is immutable. */ public void setTableName(String tableName) throws SchemaNameException { - // First, verify that overriding the table names will not produce multiple - // ambiguous column names. + // First, verify that overriding the table names will not produce + // multiple ambiguous column names. ArrayList<String> duplicateNames = null; - for (Map.Entry<String, ArrayList<IndexedColumnInfo> > entry : + for (Map.Entry<String, ArrayList<IndexedColumnInfo>> entry : colsHashedByColumn.entrySet()) { if (entry.getValue().size() > 1) { @@ -442,7 +475,27 @@ public class Schema implements Serializable, Iterable<ColumnInfo> { } + /** + * Finds the index of the specified column in this schema, or returns -1 + * if the schema contains no column of the specified name. The + * column-name object is not required to specify a table name; however, if + * the table name is unspecified and the column name is ambiguous then an + * exception will be thrown. + * + * @param colName column-name object to use for looking up the column in + * the schema + * + * @return the zero-based index of the column, or -1 if the schema does + * not contain a column of the specified name. 
+ * + * @throws IllegalArgumentException if {@code colName} is {@code null} + * @throws SchemaNameException if {@code colName} doesn't specify a table + * name, and multiple columns have the specified column name + */ public int getColumnIndex(ColumnName colName) { + if (colName == null) + throw new IllegalArgumentException("colName cannot be null"); + if (colName.isColumnWildcard()) throw new IllegalArgumentException("colName cannot be a wildcard"); @@ -450,17 +503,72 @@ public class Schema implements Serializable, Iterable<ColumnInfo> { } + /** + * Finds the index of the specified column in this schema, or returns -1 + * if the schema contains no column of the specified name. The + * column-info object is not required to specify a table name; however, if + * the table name is unspecified and the column name is ambiguous then an + * exception will be thrown. + * + * @param colInfo column-info object to use for looking up the column in + * the schema + * + * @return the zero-based index of the column, or -1 if the schema does + * not contain a column of the specified name. + * + * @throws IllegalArgumentException if {@code colInfo} is {@code null} + * @throws SchemaNameException if {@code colInfo} doesn't specify a table + * name, and multiple columns have the specified column name + */ public int getColumnIndex(ColumnInfo colInfo) { + if (colInfo == null) + throw new IllegalArgumentException("colInfo cannot be null"); + return getColumnIndex(colInfo.getTableName(), colInfo.getName()); } + /** + * Finds the index of the specified column in this schema, or returns -1 + * if the schema contains no column of the specified name. The table name + * is unspecified; if the column name is ambiguous then an exception will + * be thrown. + * + * @param colName the column name to look up + * + * @return the zero-based index of the column, or -1 if the schema does + * not contain a column of the specified name.
+ * + * @throws IllegalArgumentException if {@code colName} is {@code null} + * @throws SchemaNameException if multiple columns have the specified + * column name + */ public int getColumnIndex(String colName) { return getColumnIndex(null, colName); } + /** + * Finds the index of the specified column in this schema, or returns -1 + * if the schema contains no column of the specified name. The table name + * may be specified or it may be {@code null}; if {@code null} and the + * column name is ambiguous then an exception will be thrown. + * + * @param tblName the table name, or {@code null} if the table name is not + * known or unspecified + * @param colName the column name to look up + * + * @return the zero-based index of the column, or -1 if the schema does + * not contain a column of the specified name. + * + * @throws IllegalArgumentException if {@code colName} is {@code null} + * @throws SchemaNameException if {@code tblName} is {@code null} and + * multiple columns have the specified column name + */ public int getColumnIndex(String tblName, String colName) { + if (colName == null) + throw new IllegalArgumentException("colName cannot be null"); + ArrayList<IndexedColumnInfo> colList = colsHashedByColumn.get(colName); if (colList == null) @@ -671,6 +779,15 @@ public class Schema implements Serializable, Iterable<ColumnInfo> { } + /** + * Adds another candidate key to the schema. + * + * @param ck the candidate key to add to the schema. + * + * @throws IllegalArgumentException if {@code ck} is {@code null}, or if + * {@code ck} is a primary key and the schema already contains a + * primary key. + */ public void addCandidateKey(KeyColumnRefs ck) { if (ck == null) throw new IllegalArgumentException("ck cannot be null"); @@ -685,19 +802,30 @@ public class Schema implements Serializable, Iterable<ColumnInfo> { } + /** + * Returns a count of how many candidate keys are present on the schema. 
+ * + * @return a count of how many candidate keys are present on the schema. + */ public int numCandidateKeys() { return candidateKeys.size(); } + /** + * Returns an unmodifiable list of candidate keys present on the schema. + * + * @return an unmodifiable list of candidate keys present on the schema. + */ public List<KeyColumnRefs> getCandidateKeys() { return Collections.unmodifiableList(candidateKeys); } /** - * This helper function returns {@code true} if this table has a candidate - * key on the set of columns specified in the argument. + * Returns {@code true} if this schema has a candidate key on the set of + * columns specified in the argument. The columns are specified by an + * array of zero-based indexes. * * @param colIndexes the set of columns to check against this table * to see if it's a candidate key @@ -710,11 +838,33 @@ public class Schema implements Serializable, Iterable<ColumnInfo> { } + /** + * Returns {@code true} if this schema has a candidate key on the set of + * columns specified in the argument. The columns are specified by a + * {@code ColumnRefs} object. + * + * @param colRefs the set of columns to check against this table to see if + * it's a candidate key + * + * @return {@code true} if this table has a candidate key on the + * specified columns; {@code false} otherwise + */ public boolean hasKeyOnColumns(ColumnRefs colRefs) { return hasKeyOnColumns(colRefs.getCols()); } + /** + * Returns any candidate-key from this schema that has the specified set + * of columns. Note that the key's columns may be in a different order + * than those specified in the argument. 
+ * + * @param colIndexes the set of columns to check against this table + * to see if it's a candidate key + * + * @return a candidate key on the specified columns, or {@code null} + * if the schema contains no key on the specified columns + */ public KeyColumnRefs getKeyOnColumns(int[] colIndexes) { for (KeyColumnRefs ck : candidateKeys) if (ck.hasSameColumns(colIndexes)) @@ -724,11 +874,33 @@ public class Schema implements Serializable, Iterable<ColumnInfo> { } + /** + * Returns any candidate-key from this schema that has the specified set + * of columns. Note that the key's columns may be in a different order + * than those specified in the argument. + * + * @param colRefs the set of columns to check against this table to see if + * it's a candidate key + * + * @return a candidate key on the specified columns, or {@code null} + * if the schema contains no key on the specified columns + */ public KeyColumnRefs getKeyOnColumns(ColumnRefs colRefs) { return getKeyOnColumns(colRefs.getCols()); } + /** + * Returns all candidate-keys from this schema that have the specified set + * of columns. Note that keys may specify columns in a different order + * than those specified in the argument. If there are no keys on the + * specified columns, this method will return an empty list. + * + * @param colIndexes the set of columns to check against this table + * to see if it's a candidate key + * + * @return a list of candidate keys on the specified columns + */ public List<KeyColumnRefs> getAllKeysOnColumns(int[] colIndexes) { ArrayList<KeyColumnRefs> keys = new ArrayList<>(); @@ -741,6 +913,17 @@ public class Schema implements Serializable, Iterable<ColumnInfo> { } + /** + * Returns all candidate-keys from this schema that have the specified set + * of columns. Note that keys may specify columns in a different order + * than those specified in the argument. If there are no keys on the + * specified columns, this method will return an empty list. 
+ * + * @param colRefs the set of columns to check against this table + * to see if it's a candidate key + * + * @return a list of candidate keys on the specified columns + */ public List<KeyColumnRefs> getAllKeysOnColumns(ColumnRefs colRefs) { return getAllKeysOnColumns(colRefs.getCols()); }