| /* |
| * Copyright 2017 WebAssembly Community Group participants |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| // |
| // Local CSE |
| // |
| // This finds common sub-expressions and saves them to a local to avoid |
| // recomputing them. It runs in each basic block separately, and uses a simple |
| // algorithm, where we track "requests" to reuse a value. That is, if we see |
// an add operation appear twice, with identical inputs in both cases, then
// the second one requests to reuse the computed value from the
| // first. The first one to appear is the "original" expression that will remain |
| // in the code; we will save its value to a local, and get it from that local |
| // later: |
| // |
| // (i32.add (A) (B)) |
| // (i32.add (A) (B)) |
| // |
| // => |
| // |
| // (local.tee $temp (i32.add (A) (B))) |
| // (local.get $temp) |
| // |
| // The algorithm used here is as follows: |
| // |
| // * Scan: Hash each expression and see if it repeats later. |
| // If it does: |
| // * Note that the original appearance is requested to be reused |
| // an additional time. |
| // * Link the first expression as the original appearance of the |
| // later one. |
| // * Scan the children of the repeat and undo their requests to be |
| // replaced (as if we will reuse the parent, we don't need to do |
| // anything for the children, see below). |
| // |
| // * Check: Check if effects prevent some of the requests from being done. For |
| // example, if the value contains a load from memory, we cannot |
| // optimize around a store, as the value before the store might be |
| // different (see below). |
| // |
| // * Apply: Go through the basic block again, this time adding a tee on an |
| // expression whose value we want to use later, and replacing the |
| // uses with gets. |
| // |
| // For example, here is what the algorithm would do on |
| // |
| // (i32.eqz (A)) |
| // .. |
| // (i32.eqz (A)) |
| // |
| // Assuming A does not have side effects that interfere, this will happen: |
| // |
| // 1. Scan A and add it to the hash map of things we have seen. |
| // 2. Scan the eqz, and do the same for it. |
| // 3. Scan the second A. Increment the first A's requests counter, and mark the |
| // second A as intended to be replaced by the original A. |
| // 4. Scan the second eqz, and do similar things for it: increment the requests |
| // for the original eqz, and point to the original from the repeat. |
| // * Then also scan its children, in this case A, and decrement the first |
| // A's reuse counter, and unmark the second A's note that it is intended |
| // to be replaced. |
| // 5. After that, the second eqz requests to be replaced by the first, and |
| // there is no request on A. |
| // 6. Run through the block again, adding a tee and replacing the second eqz, |
| // resulting in: |
| // |
| // (local.tee $temp |
| // (i32.eqz |
| // (A) |
| // ) |
| // ) |
| // (local.get $temp) |
| // |
| // Note how the scanning of children avoids us adding a local for A: when we |
| // reuse the parent, we don't need to also try to reuse the child. |
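//
// (For comparison, without that scanning of children we would also add a
// useless local for A, emitting something like
//
//  (local.tee $temp
//    (i32.eqz
//      (local.tee $temp2
//        (A)
//      )
//    )
//  )
//  (local.get $temp)
//
// where $temp2 is never read, as the entire second eqz is replaced.)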
| // |
| // Effects must be considered carefully by the Checker phase. E.g.: |
| // |
| // x = load(a); |
| // store(..) |
| // y = load(a); |
| // |
| // Even though the load looks identical, the store means we may load a |
| // different value, so we will invalidate the request to optimize here. |
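//
// For instance, in wasm text this might look like (a sketch; the address and
// value operands are hypothetical):
//
//  (i32.load (local.get $a))
//  (i32.store (local.get $b) (i32.const 1))
//  (i32.load (local.get $a))
//
// The second load cannot be replaced with a local.get of the first load's
// value, as the store in between may have written to the address it loads
// from.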
| // |
| // This pass only finds entire expression trees, and not parts of them, so we |
| // will not optimize this: |
| // |
| // (A (B (C (D1)))) |
| // (A (B (C (D2)))) |
| // |
| // The innermost child is different, so the trees are not identical. However, |
| // running flatten before running this pass would allow those to be optimized as |
| // well (we would also need to simplify locals somewhat to allow the locals to |
| // be identified as identical, see pass.cpp). |
| // |
// TODO: Global, inter-block GVN, etc. However, note that at the moment the
//       cost of adding new locals here is low because their lifetimes are all
//       within a single basic block. A global optimization here could add
//       long-lived locals with register allocation costs across the entire
//       function.
| // |
| |
| #include <algorithm> |
| #include <memory> |
| |
| #include <ir/cost.h> |
| #include <ir/effects.h> |
| #include <ir/iteration.h> |
| #include <ir/linear-execution.h> |
| #include <ir/properties.h> |
| #include <ir/type-updating.h> |
| #include <ir/utils.h> |
| #include <pass.h> |
| #include <wasm-builder.h> |
| #include <wasm-traversal.h> |
| #include <wasm.h> |
| |
| namespace wasm { |
| |
| namespace { |
| |
| // An expression with a cached hash value. |
| struct HashedExpression { |
| Expression* expr; |
| size_t digest; |
| |
| HashedExpression(Expression* expr, size_t digest) |
| : expr(expr), digest(digest) {} |
| |
| HashedExpression(const HashedExpression& other) |
| : expr(other.expr), digest(other.digest) {} |
| }; |
| |
| struct HEHasher { |
| size_t operator()(const HashedExpression hashed) const { |
| return hashed.digest; |
| } |
| }; |
| |
| // A full equality check for HashedExpressions. The hash is used as a speedup, |
| // but if it matches we still verify the contents are identical. |
| struct HEComparer { |
| bool operator()(const HashedExpression a, const HashedExpression b) const { |
| if (a.digest != b.digest) { |
| return false; |
| } |
| return ExpressionAnalyzer::equal(a.expr, b.expr); |
| } |
| }; |
| |
| // Maps hashed expressions to the list of expressions that match. That is, all |
| // expressions that are equivalent (same hash, and also compare equal) will |
| // be in a vector for the corresponding entry in this map. |
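// For example, if the same (i32.mul ..) computation appears three times in a
// basic block, all three pointers end up in the vector for that entry, in
// order of appearance; the Scanner below treats the first entry as the
// original that the later ones request from.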
| using HashedExprs = std::unordered_map<HashedExpression, |
| SmallVector<Expression*, 1>, |
| HEHasher, |
| HEComparer>; |
| |
| // Request information about an expression: whether it requests to be replaced, |
| // or it is an original expression whose value can be used to replace others |
| // later. |
| struct RequestInfo { |
| // The number of other expressions that would like to reuse this value. |
| Index requests = 0; |
| |
| // If this is a repeat value, this points to the original. (And we have |
| // incremented the original's |requests|.) |
| Expression* original = nullptr; |
| |
| void validate() const { |
| // An expression cannot both be requested and make requests. If we see A |
| // appear three times, both repeats must request from the very first. |
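    // For example, given three identical expressions A1, A2, A3 in a block,
    // A1 ends up with requests == 2 and no original, while A2 and A3 each
    // point to A1 as their original and make no requests of their own.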
| assert(!(requests && original)); |
| |
    // When we encounter a RequestInfo (after its initial creation) it must
    // either request or be requested - otherwise, it should not exist, as we
    // remove unneeded things from our tracking.
| assert(requests || original); |
| } |
| }; |
| |
| // A map of expressions to their request info. |
| struct RequestInfoMap : public std::unordered_map<Expression*, RequestInfo> { |
| void dump(std::ostream& o) { |
| for (auto& [curr, info] : *this) { |
| o << *curr << " has " << info.requests << " reqs, orig: " << info.original |
| << '\n'; |
| } |
| } |
| }; |
| |
| struct Scanner |
| : public LinearExecutionWalker<Scanner, UnifiedExpressionVisitor<Scanner>> { |
| PassOptions& options; |
| |
| // Request info for all expressions ever seen. |
| RequestInfoMap& requestInfos; |
| |
| Scanner(PassOptions& options, RequestInfoMap& requestInfos) |
| : options(options), requestInfos(requestInfos) {} |
| |
| // Currently active hashed expressions in the current basic block. If we see |
| // an active expression before us that is identical to us, then it becomes our |
| // original expression that we request from. |
| HashedExprs activeExprs; |
| |
| // Stack of information of all active expressions. We store hash values and |
| // possibility (as computed by isPossible), which we compute incrementally so |
| // as to avoid N^2 work (which could happen if we recomputed children). |
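  // For example, when we visit (i32.add (A) (B)) in the post-order walk, the
  // entries for A and B are the top two items on this stack; we pop them,
  // fold their hashes and possibility into the add's own entry, and push
  // that single combined entry.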
| using HashPossibility = std::pair<size_t, bool>; |
| SmallVector<HashPossibility, 10> activeIncrementalInfo; |
| |
| static void doNoteNonLinear(Scanner* self, Expression** currp) { |
| // We are starting a new basic block. Forget all the currently-hashed |
| // expressions, as we no longer want to make connections to anything from |
| // another block. |
| self->activeExprs.clear(); |
| self->activeIncrementalInfo.clear(); |
| // Note that we do not clear requestInfos - that is information we will use |
| // later in the Applier class. That is, we've cleared all the active |
| // information, leaving the things we need later. |
| } |
| |
  // It is ok to look at adjacent blocks together: if a later part of a block
  // is not reached, that is fine, as any changes we make there would simply
  // never be reached either.
| bool connectAdjacentBlocks = true; |
| |
| void visitExpression(Expression* curr) { |
| // Compute the hash, using the pre-computed hashes of the children, which |
| // are saved. This allows us to hash everything in linear time. |
| // |
| // Note that we must compute the hash first, as we need it even for things |
| // that are not isRelevant() (if they are the children of a relevant thing). |
| auto numChildren = Properties::getNumChildren(curr); |
| auto hash = ExpressionAnalyzer::shallowHash(curr); |
| auto possible = isPossible(curr); |
| for (Index i = 0; i < numChildren; i++) { |
| if (activeIncrementalInfo.empty()) { |
| // The child was in another block, so this expression cannot be |
| // optimized. |
| return; |
| } |
| auto [currHash, currPossible] = activeIncrementalInfo.back(); |
| activeIncrementalInfo.pop_back(); |
| hash_combine(hash, currHash); |
| if (!currPossible) { |
| possible = false; |
| } |
| } |
| activeIncrementalInfo.emplace_back(hash, possible); |
| |
| // Check if this is something possible and also relevant for optimization. |
| if (!possible || !isRelevant(curr)) { |
| return; |
| } |
| |
| auto& vec = activeExprs[HashedExpression(curr, hash)]; |
| vec.push_back(curr); |
| if (vec.size() > 1) { |
| // This is a repeat expression. Add a request for it. |
| auto& info = requestInfos[curr]; |
| auto* original = vec[0]; |
| info.original = original; |
| |
| // Mark the request on the original. Note that this may create the |
| // requestInfo for it, if it is the first request (this avoids us creating |
| // requests eagerly). |
| requestInfos[original].requests++; |
| |
| // Remove any requests from the expression's children, as we will replace |
| // the entire thing (see explanation earlier). Note that we just need to |
| // go over our direct children, as grandchildren etc. have already been |
| // processed. That is, if we have |
| // |
| // (A (B (C)) |
| // (A (B (C)) |
| // |
| // Then when we see the second B we will mark the second C as no longer |
| // requesting replacement. And then when we see the second A, all it needs |
| // to update is the second B. |
| for (auto* child : ChildIterator(curr)) { |
| if (!requestInfos.count(child)) { |
| // The child never had a request. While it repeated (since the parent |
| // repeats), it was not relevant for the optimization so we never |
| // created a requestInfo for it. |
| continue; |
| } |
| |
| // Remove the child. |
| auto& childInfo = requestInfos[child]; |
| auto* childOriginal = childInfo.original; |
| requestInfos.erase(child); |
| |
| // Update the child's original, potentially erasing it too if no |
| // requests remain. |
| assert(childOriginal); |
| auto& childOriginalRequests = requestInfos[childOriginal].requests; |
| assert(childOriginalRequests > 0); |
| childOriginalRequests--; |
| if (childOriginalRequests == 0) { |
| requestInfos.erase(childOriginal); |
| } |
| } |
| } |
| } |
| |
| // Only some values are relevant to be optimized. |
| bool isRelevant(Expression* curr) { |
| // * Ignore anything that is not a concrete type, as we are looking for |
| // computed values to reuse, and so none and unreachable are irrelevant. |
| // * Ignore local.get and set, as those are the things we optimize to. |
| // * Ignore constants so that we don't undo the effects of constant |
| // propagation. |
| // * Ignore things we cannot put in a local, as then we can't do this |
| // optimization at all. |
| // |
| // More things matter here, like having side effects or not, but computing |
| // them is not cheap, so leave them for later, after we know if there |
| // actually are any requests for reuse of this value (which is rare). |
| if (!curr->type.isConcrete() || curr->is<LocalGet>() || |
| curr->is<LocalSet>() || Properties::isConstantExpression(curr) || |
| !TypeUpdating::canHandleAsLocal(curr->type)) { |
| return false; |
| } |
| |
| // If the size is at least 3, then if we have two of them we have 6, |
| // and so adding one set+one get and removing one of the items itself |
| // is not detrimental, and may be beneficial. |
| // TODO: investigate size 2 |
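    // For example, two copies of an expression of size 3 add up to 6, while
    // after the optimization we keep one copy (3) plus a tee (1) and a get
    // (1), for a total of 5, which is no worse and is often better.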
| auto size = Measurer::measure(curr); |
| if (options.shrinkLevel > 0 && size >= 3) { |
| return true; |
| } |
| |
    // If we focus on speed, any reduction in cost is beneficial, as the cost
    // of a get is essentially free. However, the VM will also do CSE/GVN
    // itself, which makes minor improvements not worthwhile, so skip things
    // of size 1 (like a global.get).
| if (options.shrinkLevel == 0 && CostAnalyzer(curr).cost > 0 && size >= 2) { |
| return true; |
| } |
| |
| return false; |
| } |
| |
| // Some things are not possible, and also prevent their parents from being |
| // possible as well. This is different from isRelevant in that relevance is |
| // considered for the entire expression, including children - e.g., is the |
| // total size big enough - while isPossible checks conditions that prevent |
| // using an expression at all. |
| bool isPossible(Expression* curr) { |
| // We will fully compute effects later, but consider shallow effects at this |
| // early time to ignore things that cannot be optimized later, because we |
| // use a greedy algorithm. Specifically, imagine we see this: |
| // |
| // (call |
| // (i32.add |
| // .. |
| // ) |
| // ) |
| // |
| // If we considered the call relevant then we'd start to look for that |
| // larger pattern that contains the add, but then when we find that it |
| // cannot be optimized later it is too late for the add. (Instead of |
| // checking effects here we could perhaps add backtracking, but that sounds |
| // more complex.) |
| // |
| // We use |hasNonTrapSideEffects| because if a trap occurs the optimization |
| // remains valid: both this and the copy of it would trap, which means the |
| // first traps and the second isn't reached anyhow. |
| // |
| // (We don't stash these effects because we may compute many of them here, |
| // and only need the few for those patterns that repeat.) |
| if (ShallowEffectAnalyzer(options, *getModule(), curr) |
| .hasNonTrapSideEffects()) { |
| return false; |
| } |
| |
    // We also cannot optimize away something that is intrinsically
    // nondeterministic: even if it has no side effects, it may return a
    // different result each time, in which case we cannot optimize away
    // repeats.
| if (Properties::isShallowlyGenerative(curr)) { |
| return false; |
| } |
| |
| return true; |
| } |
| }; |
| |
| // Check for invalidations due to effects. We do this after scanning as effect |
| // computation is not cheap, and there are usually not many identical fragments |
| // of code. |
| // |
| // This updates the RequestInfos of things it sees are invalidated, which will |
| // make Applier ignore them. |
| struct Checker |
| : public LinearExecutionWalker<Checker, UnifiedExpressionVisitor<Checker>> { |
| PassOptions& options; |
| RequestInfoMap& requestInfos; |
| |
| Checker(PassOptions& options, RequestInfoMap& requestInfos) |
| : options(options), requestInfos(requestInfos) {} |
| |
| struct ActiveOriginalInfo { |
| // How many of the requests remain to be seen during our walk. When this |
| // reaches 0, we know that the original is no longer requested from later in |
| // the block. |
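    // For example, if an original has three requests in total and we have
    // already walked past one of the repeats, this is 2.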
| Index requestsLeft; |
| |
| // The effects in the expression. |
| EffectAnalyzer effects; |
| }; |
| |
| // The currently relevant original expressions, that is, the ones that may be |
| // optimized in the current basic block. |
| std::unordered_map<Expression*, ActiveOriginalInfo> activeOriginals; |
| |
| void visitExpression(Expression* curr) { |
| // This is the first time we encounter this expression. |
| assert(!activeOriginals.count(curr)); |
| |
    // Given the current expression, see whether it invalidates any of the
    // currently-active originals.
| if (!activeOriginals.empty()) { |
| EffectAnalyzer effects(options, *getModule()); |
| // We can ignore traps here: |
| // |
| // (ORIGINAL) |
| // (curr) |
| // (COPY) |
| // |
| // We are some code in between an original and a copy of it, and we are |
| // trying to turn COPY into a local.get of a value that we stash at the |
| // original. If |curr| traps then we simply don't reach the copy anyhow. |
| effects.trap = false; |
| // We only need to visit this node itself, as we have already visited its |
| // children by the time we get here. |
| effects.visit(curr); |
| |
| std::vector<Expression*> invalidated; |
      for (auto& [original, originalInfo] : activeOriginals) {
        if (effects.invalidates(originalInfo.effects)) {
          invalidated.push_back(original);
        }
      }
| |
| for (auto* original : invalidated) { |
| // Remove all requests after this expression, as we cannot optimize to |
| // them. |
| requestInfos[original].requests -= |
| activeOriginals.at(original).requestsLeft; |
| |
| // If no requests remain at all (that is, there were no requests we |
| // could provide before we ran into this invalidation) then we do not |
| // need this original at all. |
| if (requestInfos[original].requests == 0) { |
| requestInfos.erase(original); |
| } |
| |
| activeOriginals.erase(original); |
| } |
| } |
| |
| auto iter = requestInfos.find(curr); |
| if (iter == requestInfos.end()) { |
| return; |
| } |
| auto& info = iter->second; |
| info.validate(); |
| |
| if (info.requests > 0) { |
| // This is an original. Compute its side effects, as we cannot optimize |
| // away repeated appearances if it has any. |
| EffectAnalyzer effects(options, *getModule(), curr); |
| |
| // We can ignore traps here, as we replace a repeating expression with a |
| // single appearance of it, a store to a local, and gets in the other |
| // locations, and so if the expression traps then the first appearance - |
| // that we keep around - would trap, and the others are never reached |
| // anyhow. (The other checks we perform here, including invalidation and |
| // determinism, will ensure that either all of the appearances trap, or |
| // none of them.) |
| effects.trap = false; |
| |
| // Note that we've already checked above that this has no side effects or |
| // generativity: if we got here, then it is good to go from the |
| // perspective of this expression itself (but may be invalidated by other |
| // code in between, see above). |
| activeOriginals.emplace( |
| curr, ActiveOriginalInfo{info.requests, std::move(effects)}); |
| } else if (info.original) { |
| // The original may have already been invalidated. If so, remove our info |
| // as well. |
| auto originalIter = activeOriginals.find(info.original); |
| if (originalIter == activeOriginals.end()) { |
| requestInfos.erase(iter); |
| return; |
| } |
| |
| // After visiting this expression, we have one less request for its |
| // original, and perhaps none are left. |
| auto& originalInfo = originalIter->second; |
| if (originalInfo.requestsLeft == 1) { |
| activeOriginals.erase(info.original); |
| } else { |
| originalInfo.requestsLeft--; |
| } |
| } |
| } |
| |
| static void doNoteNonLinear(Checker* self, Expression** currp) { |
| // Between basic blocks there can be no active originals. |
| assert(self->activeOriginals.empty()); |
| } |
| |
| // See the same code above. |
| bool connectAdjacentBlocks = true; |
| |
| void visitFunction(Function* curr) { |
| // At the end of the function there can be no active originals. |
| assert(activeOriginals.empty()); |
| } |
| }; |
| |
| // Applies the optimization now that we know which requests are valid. |
| struct Applier |
| : public LinearExecutionWalker<Applier, UnifiedExpressionVisitor<Applier>> { |
  RequestInfoMap& requestInfos;
| |
| Applier(RequestInfoMap& requestInfos) : requestInfos(requestInfos) {} |
| |
| // Maps the original expressions that we save to locals to the local indexes |
| // for them. |
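  // For example, once the original (i32.eqz ..) from the comment at the top
  // of this file is tee'd into a new local, this maps that expression to the
  // new local's index, so that later repeats can be replaced with gets of it.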
| std::unordered_map<Expression*, Index> originalLocalMap; |
| |
| void visitExpression(Expression* curr) { |
| auto iter = requestInfos.find(curr); |
| if (iter == requestInfos.end()) { |
| return; |
| } |
| |
| const auto& info = iter->second; |
| info.validate(); |
| |
| if (info.requests) { |
| // We have requests for this value. Add a local and tee the value to |
| // there. |
| Index local = originalLocalMap[curr] = |
| Builder::addVar(getFunction(), curr->type); |
| replaceCurrent( |
| Builder(*getModule()).makeLocalTee(local, curr, curr->type)); |
| } else if (info.original) { |
| auto& originalInfo = requestInfos.at(info.original); |
| if (originalInfo.requests) { |
| // This is a valid request of an original value. Get the value from the |
| // local. |
| assert(originalLocalMap.count(info.original)); |
| replaceCurrent( |
| Builder(*getModule()) |
| .makeLocalGet(originalLocalMap[info.original], curr->type)); |
| originalInfo.requests--; |
| } |
| } |
| } |
| |
| static void doNoteNonLinear(Applier* self, Expression** currp) { |
| // Clear the state between blocks. |
| self->originalLocalMap.clear(); |
| } |
| |
| // See the same code above. |
| bool connectAdjacentBlocks = true; |
| }; |
| |
| } // anonymous namespace |
| |
| struct LocalCSE : public WalkerPass<PostWalker<LocalCSE>> { |
| bool isFunctionParallel() override { return true; } |
| |
| // FIXME DWARF updating does not handle local changes yet. |
| bool invalidatesDWARF() override { return true; } |
| |
| std::unique_ptr<Pass> create() override { |
| return std::make_unique<LocalCSE>(); |
| } |
| |
| void doWalkFunction(Function* func) { |
| auto& options = getPassOptions(); |
| |
| RequestInfoMap requestInfos; |
| |
| Scanner scanner(options, requestInfos); |
| scanner.walkFunctionInModule(func, getModule()); |
| if (requestInfos.empty()) { |
| // We did not find any repeated expressions at all. |
| return; |
| } |
| |
| Checker checker(options, requestInfos); |
| checker.walkFunctionInModule(func, getModule()); |
| if (requestInfos.empty()) { |
| // No repeated expressions remain after checking for effects. |
| return; |
| } |
| |
| Applier applier(requestInfos); |
| applier.walkFunctionInModule(func, getModule()); |
| } |
| }; |
| |
| Pass* createLocalCSEPass() { return new LocalCSE(); } |
| |
| } // namespace wasm |