Initial version of CostBasedJoinPlanner for HW4

4eefbbc0 · Donald H. (Donnie) Pinkston, III · c7b4e8f9 · 4eefbbc0
Commit 4eefbbc0 authored 6 years ago by Donald H. (Donnie) Pinkston, III
Hide whitespace changes
Inline Side-by-side

Showing

with 426 additions and 0 deletions
+426 -0
--- a/src/main/java/edu/caltech/nanodb/queryeval/CostBasedJoinPlanner.java
+++ b/src/main/java/edu/caltech/nanodb/queryeval/CostBasedJoinPlanner.java
+package edu.caltech.nanodb.queryeval;
+
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+import edu.caltech.nanodb.expressions.Expression;
+import edu.caltech.nanodb.plannodes.FileScanNode;
+import edu.caltech.nanodb.plannodes.PlanNode;
+import edu.caltech.nanodb.plannodes.SelectNode;
+import edu.caltech.nanodb.queryast.FromClause;
+import edu.caltech.nanodb.queryast.SelectClause;
+import edu.caltech.nanodb.relations.TableInfo;
+import edu.caltech.nanodb.storage.StorageManager;
+
+
+/**
+ * This planner implementation uses dynamic programming to devise an optimal
+ * join strategy for the query.  As always, queries are optimized in units of
+ * <tt>SELECT</tt>-<tt>FROM</tt>-<tt>WHERE</tt> subqueries; optimizations
+ * don't currently span multiple subqueries.
+ */
+public class CostBasedJoinPlanner implements Planner {
+
+    /** A logging object for reporting anything interesting that happens. */
+    private static Logger logger = LogManager.getLogger(
+        CostBasedJoinPlanner.class);
+
+
+    /** The storage manager used during query planning. */
+    protected StorageManager storageManager;
+
+
+    /** Sets the server to be used during query planning. */
+    public void setStorageManager(StorageManager storageManager) {
+        if (storageManager == null)
+            throw new IllegalArgumentException("storageManager cannot be null");
+
+        this.storageManager = storageManager;
+    }
+
+
+    /**
+     * This helper class is used to keep track of one "join component" in the
+     * dynamic programming algorithm.  A join component is simply a query plan
+     * for joining one or more leaves of the query.
+     * <p>
+     * In this context, a "leaf" may either be a base table or a subquery in
+     * the <tt>FROM</tt>-clause of the query.  However, the planner will
+     * attempt to push conjuncts down the plan as far as possible, so even if
+     * a leaf is a base table, the plan may be a bit more complex than just a
+     * single file-scan.
+     */
+    private static class JoinComponent {
+        /**
+         * This is the join plan itself, that joins together all leaves
+         * specified in the {@link #leavesUsed} field.
+         */
+        public PlanNode joinPlan;
+
+        /**
+         * This field specifies the collection of leaf-plans that are joined by
+         * the plan in this join-component.
+         */
+        public HashSet<PlanNode> leavesUsed;
+
+        /**
+         * This field specifies the collection of all conjuncts use by this join
+         * plan.  It allows us to easily determine what join conjuncts still
+         * remain to be incorporated into the query.
+         */
+        public HashSet<Expression> conjunctsUsed;
+
+        /**
+         * Constructs a new instance for a <em>leaf node</em>.  It should not
+         * be used for join-plans that join together two or more leaves.  This
+         * constructor simply adds the leaf-plan into the {@link #leavesUsed}
+         * collection.
+         *
+         * @param leafPlan the query plan for this leaf of the query.
+         *
+         * @param conjunctsUsed the set of conjuncts used by the leaf plan.
+         *        This may be an empty set if no conjuncts apply solely to
+         *        this leaf, or it may be nonempty if some conjuncts apply
+         *        solely to this leaf.
+         */
+        public JoinComponent(PlanNode leafPlan, HashSet<Expression> conjunctsUsed) {
+            leavesUsed = new HashSet<>();
+            leavesUsed.add(leafPlan);
+
+            joinPlan = leafPlan;
+
+            this.conjunctsUsed = conjunctsUsed;
+        }
+
+        /**
+         * Constructs a new instance for a <em>non-leaf node</em>.  It should
+         * not be used for leaf plans!
+         *
+         * @param joinPlan the query plan that joins together all leaves
+         *        specified in the <tt>leavesUsed</tt> argument.
+         *
+         * @param leavesUsed the set of two or more leaf plans that are joined
+         *        together by the join plan.
+         *
+         * @param conjunctsUsed the set of conjuncts used by the join plan.
+         *        Obviously, it is expected that all conjuncts specified here
+         *        can actually be evaluated against the join plan.
+         */
+        public JoinComponent(PlanNode joinPlan, HashSet<PlanNode> leavesUsed,
+                             HashSet<Expression> conjunctsUsed) {
+            this.joinPlan = joinPlan;
+            this.leavesUsed = leavesUsed;
+            this.conjunctsUsed = conjunctsUsed;
+        }
+    }
+
+
+    /**
+     * Returns the root of a plan tree suitable for executing the specified
+     * query.
+     *
+     * @param selClause an object describing the query to be performed
+     *
+     * @return a plan tree for executing the specified query
+     */
+    public PlanNode makePlan(SelectClause selClause,
+        List<SelectClause> enclosingSelects) {
+
+        // TODO:  Implement!
+        //
+        // This is a very rough sketch of how this function will work,
+        // focusing mainly on join planning:
+        //
+        // 1)  Pull out the top-level conjuncts from the FROM and WHERE
+        //     clauses on the query, since we will handle them in special ways
+        //     if we have outer joins.
+        //
+        // 2)  Create an optimal join plan from the top-level from-clause and
+        //     the top-level conjuncts.
+        //
+        // 3)  If there are any unused conjuncts, determine how to handle them.
+        //
+        // 4)  Create a project plan-node if necessary.
+        //
+        // 5)  Handle other clauses such as ORDER BY, LIMIT/OFFSET, etc.
+        //
+        // Supporting other query features, such as grouping/aggregation,
+        // various kinds of subqueries, queries without a FROM clause, etc.,
+        // can all be incorporated into this sketch relatively easily.
+
+        return null;
+    }
+
+
+    /**
+     * Given the top-level {@code FromClause} for a SELECT-FROM-WHERE block,
+     * this helper generates an optimal join plan for the {@code FromClause}.
+     *
+     * @param fromClause the top-level {@code FromClause} of a
+     *        SELECT-FROM-WHERE block.
+     * @param extraConjuncts any extra conjuncts (e.g. from the WHERE clause,
+     *        or HAVING clause)
+     * @return a {@code JoinComponent} object that represents the optimal plan
+     *         corresponding to the FROM-clause
+     */
+    private JoinComponent makeJoinPlan(FromClause fromClause,
+        Collection<Expression> extraConjuncts) {
+
+        // These variables receive the leaf-clauses and join conjuncts found
+        // from scanning the sub-clauses.  Initially, we put the extra conjuncts
+        // into the collection of conjuncts.
+        HashSet<Expression> conjuncts = new HashSet<>();
+        ArrayList<FromClause> leafFromClauses = new ArrayList<>();
+
+        collectDetails(fromClause, conjuncts, leafFromClauses);
+
+        logger.debug("Making join-plan for " + fromClause);
+        logger.debug("    Collected conjuncts:  " + conjuncts);
+        logger.debug("    Collected FROM-clauses:  " + leafFromClauses);
+        logger.debug("    Extra conjuncts:  " + extraConjuncts);
+
+        if (extraConjuncts != null)
+            conjuncts.addAll(extraConjuncts);
+
+        // Make a read-only set of the input conjuncts, to avoid bugs due to
+        // unintended side-effects.
+        Set<Expression> roConjuncts = Collections.unmodifiableSet(conjuncts);
+
+        // Create a subplan for every single leaf FROM-clause, and prepare the
+        // leaf-plan.
+
+        logger.debug("Generating plans for all leaves");
+        ArrayList<JoinComponent> leafComponents = generateLeafJoinComponents(
+            leafFromClauses, roConjuncts);
+
+        // Print out the results, for debugging purposes.
+        if (logger.isDebugEnabled()) {
+            for (JoinComponent leaf : leafComponents) {
+                logger.debug("    Leaf plan:\n" +
+                    PlanNode.printNodeTreeToString(leaf.joinPlan, true));
+            }
+        }
+
+        // Build up the full query-plan using a dynamic programming approach.
+
+        JoinComponent optimalJoin =
+            generateOptimalJoin(leafComponents, roConjuncts);
+
+        PlanNode plan = optimalJoin.joinPlan;
+        logger.info("Optimal join plan generated:\n" +
+            PlanNode.printNodeTreeToString(plan, true));
+
+        return optimalJoin;
+    }
+
+
+    /**
+     * This helper method pulls the essential details for join optimization
+     * out of a <tt>FROM</tt> clause.
+     *
+     * TODO:  FILL IN DETAILS.
+     *
+     * @param fromClause the from-clause to collect details from
+     *
+     * @param conjuncts the collection to add all conjuncts to
+     *
+     * @param leafFromClauses the collection to add all leaf from-clauses to
+     */
+    private void collectDetails(FromClause fromClause,
+        HashSet<Expression> conjuncts, ArrayList<FromClause> leafFromClauses) {
+
+        // TODO:  IMPLEMENT
+    }
+
+
+    /**
+     * This helper method performs the first step of the dynamic programming
+     * process to generate an optimal join plan, by generating a plan for every
+     * leaf from-clause identified from analyzing the query.  Leaf plans are
+     * usually very simple; they are built either from base-tables or
+     * <tt>SELECT</tt> subqueries.  The most complex detail is that any
+     * conjuncts in the query that can be evaluated solely against a particular
+     * leaf plan-node will be associated with the plan node.  <em>This is a
+     * heuristic</em> that usually produces good plans (and certainly will for
+     * the current state of the database), but could easily interfere with
+     * indexes or other plan optimizations.
+     *
+     * @param leafFromClauses the collection of from-clauses found in the query
+     *
+     * @param conjuncts the collection of conjuncts that can be applied at this
+     *                  level
+     *
+     * @return a collection of {@link JoinComponent} object containing the plans
+     *         and other details for each leaf from-clause
+     */
+    private ArrayList<JoinComponent> generateLeafJoinComponents(
+        Collection<FromClause> leafFromClauses, Collection<Expression> conjuncts) {
+
+        // Create a subplan for every single leaf FROM-clause, and prepare the
+        // leaf-plan.
+        ArrayList<JoinComponent> leafComponents = new ArrayList<>();
+        for (FromClause leafClause : leafFromClauses) {
+            HashSet<Expression> leafConjuncts = new HashSet<>();
+
+            PlanNode leafPlan =
+                makeLeafPlan(leafClause, conjuncts, leafConjuncts);
+
+            JoinComponent leaf = new JoinComponent(leafPlan, leafConjuncts);
+            leafComponents.add(leaf);
+        }
+
+        return leafComponents;
+    }
+
+
+    /**
+     * Constructs a plan tree for evaluating the specified from-clause.
+     * TODO:  COMPLETE THE DOCUMENTATION
+     *
+     * @param fromClause the select nodes that need to be joined.
+     *
+     * @param conjuncts additional conjuncts that can be applied when
+     *        constructing the from-clause plan.
+     *
+     * @param leafConjuncts this is an output-parameter.  Any conjuncts
+     *        applied in this plan from the <tt>conjuncts</tt> collection
+     *        should be added to this out-param.
+     *
+     * @return a plan tree for evaluating the specified from-clause
+     *
+     * @throws IllegalArgumentException if the specified from-clause is a join
+     *         expression that isn't an outer join, or has some other
+     *         unrecognized type.
+     */
+    private PlanNode makeLeafPlan(FromClause fromClause,
+        Collection<Expression> conjuncts, HashSet<Expression> leafConjuncts) {
+
+        // TODO:  IMPLEMENT.
+        //        If you apply any conjuncts then make sure to add them to the
+        //        leafConjuncts collection.
+        //
+        //        Don't forget that all from-clauses can specify an alias.
+        //
+        //        Concentrate on properly handling cases other than outer
+        //        joins first, then focus on outer joins once you have the
+        //        typical cases supported.
+
+        return null;
+    }
+
+
+    /**
+     * This helper method builds up a full join-plan using a dynamic programming
+     * approach.  The implementation maintains a collection of optimal
+     * intermediate plans that join <em>n</em> of the leaf nodes, each with its
+     * own associated cost, and then uses that collection to generate a new
+     * collection of optimal intermediate plans that join <em>n+1</em> of the
+     * leaf nodes.  This process completes when all leaf plans are joined
+     * together; there will be <em>one</em> plan, and it will be the optimal
+     * join plan (as far as our limited estimates can determine, anyway).
+     *
+     * @param leafComponents the collection of leaf join-components, generated
+     *        by the {@link #generateLeafJoinComponents} method.
+     *
+     * @param conjuncts the collection of all conjuncts found in the query
+     *
+     * @return a single {@link JoinComponent} object that joins all leaf
+     *         components together in an optimal way.
+     */
+    private JoinComponent generateOptimalJoin(
+        ArrayList<JoinComponent> leafComponents, Set<Expression> conjuncts) {
+
+        // This object maps a collection of leaf-plans (represented as a
+        // hash-set) to the optimal join-plan for that collection of leaf plans.
+        //
+        // This collection starts out only containing the leaf plans themselves,
+        // and on each iteration of the loop below, join-plans are grown by one
+        // leaf.  For example:
+        //   * In the first iteration, all plans joining 2 leaves are created.
+        //   * In the second iteration, all plans joining 3 leaves are created.
+        //   * etc.
+        // At the end, the collection will contain ONE entry, which is the
+        // optimal way to join all N leaves.  Go Go Gadget Dynamic Programming!
+        HashMap<HashSet<PlanNode>, JoinComponent> joinPlans = new HashMap<>();
+
+        // Initially populate joinPlans with just the N leaf plans.
+        for (JoinComponent leaf : leafComponents)
+            joinPlans.put(leaf.leavesUsed, leaf);
+
+        while (joinPlans.size() > 1) {
+            logger.debug("Current set of join-plans has " + joinPlans.size() +
+                " plans in it.");
+
+            // This is the set of "next plans" we will generate.  Plans only
+            // get stored if they are the first plan that joins together the
+            // specified leaves, or if they are better than the current plan.
+            HashMap<HashSet<PlanNode>, JoinComponent> nextJoinPlans =
+                new HashMap<>();
+
+            // TODO:  IMPLEMENT THE CODE THAT GENERATES OPTIMAL PLANS THAT
+            //        JOIN N + 1 LEAVES
+
+            // Now that we have generated all plans joining N leaves, time to
+            // create all plans joining N + 1 leaves.
+            joinPlans = nextJoinPlans;
+        }
+
+        // At this point, the set of join plans should only contain one plan,
+        // and it should be the optimal plan.
+
+        assert joinPlans.size() == 1 : "There can be only one optimal join plan!";
+        return joinPlans.values().iterator().next();
+    }
+
+
+    /**
+     * Constructs a simple select plan that reads directly from a table, with
+     * an optional predicate for selecting rows.
+     * <p>
+     * While this method can be used for building up larger <tt>SELECT</tt>
+     * queries, the returned plan is also suitable for use in <tt>UPDATE</tt>
+     * and <tt>DELETE</tt> command evaluation.  In these cases, the plan must
+     * only generate tuples of type {@link edu.caltech.nanodb.storage.PageTuple},
+     * so that the command can modify or delete the actual tuple in the file's
+     * page data.
+     *
+     * @param tableName The name of the table that is being selected from.
+     *
+     * @param predicate An optional selection predicate, or {@code null} if
+     *        no filtering is desired.
+     *
+     * @return A new plan-node for evaluating the select operation.
+     */
+    public SelectNode makeSimpleSelect(String tableName, Expression predicate,
+        List<SelectClause> enclosingSelects) {
+        if (tableName == null)
+            throw new IllegalArgumentException("tableName cannot be null");
+
+        if (enclosingSelects != null) {
+            // If there are enclosing selects, this subquery's predicate may
+            // reference an outer query's value, but we don't detect that here.
+            // Therefore we will probably fail with an unrecognized column
+            // reference.
+            logger.warn("Currently we are not clever enough to detect " +
+                "correlated subqueries, so expect things are about to break...");
+        }
+
+        // Open the table.
+        TableInfo tableInfo = storageManager.getTableManager().openTable(tableName);
+
+        // Make a SelectNode to read rows from the table, with the specified
+        // predicate.
+        SelectNode selectNode = new FileScanNode(tableInfo, predicate);
+        selectNode.prepare();
+        return selectNode;
+    }
+}