//===- GenericUniformityImpl.h -----------------------*- C++ -*------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This template implementation resides in a separate file so that it // does not get injected into every .cpp file that includes the // generic header. // // DO NOT INCLUDE THIS FILE WHEN MERELY USING UNIFORMITYINFO. // // This file should only be included by files that implement a // specialization of the relvant templates. Currently these are: // - UniformityAnalysis.cpp // // Note: The DEBUG_TYPE macro should be defined before using this // file so that any use of LLVM_DEBUG is associated with the // including file rather than this file. // //===----------------------------------------------------------------------===// /// /// \file /// \brief Implementation of uniformity analysis. /// /// The algorithm is a fixed point iteration that starts with the assumption /// that all control flow and all values are uniform. Starting from sources of /// divergence (whose discovery must be implemented by a CFG- or even /// target-specific derived class), divergence of values is propagated from /// definition to uses in a straight-forward way. The main complexity lies in /// the propagation of the impact of divergent control flow on the divergence of /// values (sync dependencies). /// /// NOTE: In general, no interface exists for a transform to update /// (Machine)UniformityInfo. Additionally, (Machine)CycleAnalysis is a /// transitive dependence, but it also does not provide an interface for /// updating itself. Given that, transforms should not preserve uniformity in /// their getAnalysisUsage() callback. /// //===----------------------------------------------------------------------===// #ifndef LLVM_ADT_GENERICUNIFORMITYIMPL_H #define LLVM_ADT_GENERICUNIFORMITYIMPL_H #include "llvm/ADT/GenericUniformityInfo.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SparseBitVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/raw_ostream.h" #include #define DEBUG_TYPE "uniformity" namespace llvm { template auto unique(Range &&R) { return std::unique(adl_begin(R), adl_end(R)); } /// Construct a specially modified post-order traversal of cycles. /// /// The ModifiedPO is contructed using a virtually modified CFG as follows: /// /// 1. The successors of pre-entry nodes (predecessors of an cycle /// entry that are outside the cycle) are replaced by the /// successors of the successors of the header. /// 2. Successors of the cycle header are replaced by the exit blocks /// of the cycle. /// /// Effectively, we produce a depth-first numbering with the following /// properties: /// /// 1. Nodes after a cycle are numbered earlier than the cycle header. /// 2. The header is numbered earlier than the nodes in the cycle. /// 3. The numbering of the nodes within the cycle forms an interval /// starting with the header. /// /// Effectively, the virtual modification arranges the nodes in a /// cycle as a DAG with the header as the sole leaf, and successors of /// the header as the roots. A reverse traversal of this numbering has /// the following invariant on the unmodified original CFG: /// /// Each node is visited after all its predecessors, except if that /// predecessor is the cycle header. /// template class ModifiedPostOrder { public: using BlockT = typename ContextT::BlockT; using FunctionT = typename ContextT::FunctionT; using DominatorTreeT = typename ContextT::DominatorTreeT; using CycleInfoT = GenericCycleInfo; using CycleT = typename CycleInfoT::CycleT; using const_iterator = typename std::vector::const_iterator; ModifiedPostOrder(const ContextT &C) : Context(C) {} bool empty() const { return m_order.empty(); } size_t size() const { return m_order.size(); } void clear() { m_order.clear(); } void compute(const CycleInfoT &CI); unsigned count(BlockT *BB) const { return POIndex.count(BB); } const BlockT *operator[](size_t idx) const { return m_order[idx]; } void appendBlock(const BlockT &BB, bool isReducibleCycleHeader = false) { POIndex[&BB] = m_order.size(); m_order.push_back(&BB); LLVM_DEBUG(dbgs() << "ModifiedPO(" << POIndex[&BB] << "): " << Context.print(&BB) << "\n"); if (isReducibleCycleHeader) ReducibleCycleHeaders.insert(&BB); } unsigned getIndex(const BlockT *BB) const { assert(POIndex.count(BB)); return POIndex.lookup(BB); } bool isReducibleCycleHeader(const BlockT *BB) const { return ReducibleCycleHeaders.contains(BB); } private: SmallVector m_order; DenseMap POIndex; SmallPtrSet ReducibleCycleHeaders; const ContextT &Context; void computeCyclePO(const CycleInfoT &CI, const CycleT *Cycle, SmallPtrSetImpl &Finalized); void computeStackPO(SmallVectorImpl &Stack, const CycleInfoT &CI, const CycleT *Cycle, SmallPtrSetImpl &Finalized); }; template class DivergencePropagator; /// \class GenericSyncDependenceAnalysis /// /// \brief Locate join blocks for disjoint paths starting at a divergent branch. /// /// An analysis per divergent branch that returns the set of basic /// blocks whose phi nodes become divergent due to divergent control. /// These are the blocks that are reachable by two disjoint paths from /// the branch, or cycle exits reachable along a path that is disjoint /// from a path to the cycle latch. // --- Above line is not a doxygen comment; intentionally left blank --- // // Originally implemented in SyncDependenceAnalysis.cpp for DivergenceAnalysis. // // The SyncDependenceAnalysis is used in the UniformityAnalysis to model // control-induced divergence in phi nodes. // // -- Reference -- // The algorithm is an extension of Section 5 of // // An abstract interpretation for SPMD divergence // on reducible control flow graphs. // Julian Rosemann, Simon Moll and Sebastian Hack // POPL '21 // // // -- Sync dependence -- // Sync dependence characterizes the control flow aspect of the // propagation of branch divergence. For example, // // %cond = icmp slt i32 %tid, 10 // br i1 %cond, label %then, label %else // then: // br label %merge // else: // br label %merge // merge: // %a = phi i32 [ 0, %then ], [ 1, %else ] // // Suppose %tid holds the thread ID. Although %a is not data dependent on %tid // because %tid is not on its use-def chains, %a is sync dependent on %tid // because the branch "br i1 %cond" depends on %tid and affects which value %a // is assigned to. // // // -- Reduction to SSA construction -- // There are two disjoint paths from A to X, if a certain variant of SSA // construction places a phi node in X under the following set-up scheme. // // This variant of SSA construction ignores incoming undef values. // That is paths from the entry without a definition do not result in // phi nodes. // // entry // / \ // A \ // / \ Y // B C / // \ / \ / // D E // \ / // F // // Assume that A contains a divergent branch. We are interested // in the set of all blocks where each block is reachable from A // via two disjoint paths. This would be the set {D, F} in this // case. // To generally reduce this query to SSA construction we introduce // a virtual variable x and assign to x different values in each // successor block of A. // // entry // / \ // A \ // / \ Y // x = 0 x = 1 / // \ / \ / // D E // \ / // F // // Our flavor of SSA construction for x will construct the following // // entry // / \ // A \ // / \ Y // x0 = 0 x1 = 1 / // \ / \ / // x2 = phi E // \ / // x3 = phi // // The blocks D and F contain phi nodes and are thus each reachable // by two disjoins paths from A. // // -- Remarks -- // * In case of cycle exits we need to check for temporal divergence. // To this end, we check whether the definition of x differs between the // cycle exit and the cycle header (_after_ SSA construction). // // * In the presence of irreducible control flow, the fixed point is // reached only after multiple iterations. This is because labels // reaching the header of a cycle must be repropagated through the // cycle. This is true even in a reducible cycle, since the labels // may have been produced by a nested irreducible cycle. // // * Note that SyncDependenceAnalysis is not concerned with the points // of convergence in an irreducible cycle. It's only purpose is to // identify join blocks. The "diverged entry" criterion is // separately applied on join blocks to determine if an entire // irreducible cycle is assumed to be divergent. // // * Relevant related work: // A simple algorithm for global data flow analysis problems. // Matthew S. Hecht and Jeffrey D. Ullman. // SIAM Journal on Computing, 4(4):519–532, December 1975. // template class GenericSyncDependenceAnalysis { public: using BlockT = typename ContextT::BlockT; using DominatorTreeT = typename ContextT::DominatorTreeT; using FunctionT = typename ContextT::FunctionT; using ValueRefT = typename ContextT::ValueRefT; using InstructionT = typename ContextT::InstructionT; using CycleInfoT = GenericCycleInfo; using CycleT = typename CycleInfoT::CycleT; using ConstBlockSet = SmallPtrSet; using ModifiedPO = ModifiedPostOrder; // * if BlockLabels[B] == C then C is the dominating definition at // block B // * if BlockLabels[B] == nullptr then we haven't seen B yet // * if BlockLabels[B] == B then: // - B is a join point of disjoint paths from X, or, // - B is an immediate successor of X (initial value), or, // - B is X using BlockLabelMap = DenseMap; /// Information discovered by the sync dependence analysis for each /// divergent branch. struct DivergenceDescriptor { // Join points of diverged paths. ConstBlockSet JoinDivBlocks; // Divergent cycle exits ConstBlockSet CycleDivBlocks; // Labels assigned to blocks on diverged paths. BlockLabelMap BlockLabels; }; using DivergencePropagatorT = DivergencePropagator; GenericSyncDependenceAnalysis(const ContextT &Context, const DominatorTreeT &DT, const CycleInfoT &CI); /// \brief Computes divergent join points and cycle exits caused by branch /// divergence in \p Term. /// /// This returns a pair of sets: /// * The set of blocks which are reachable by disjoint paths from /// \p Term. /// * The set also contains cycle exits if there two disjoint paths: /// one from \p Term to the cycle exit and another from \p Term to /// the cycle header. const DivergenceDescriptor &getJoinBlocks(const BlockT *DivTermBlock); private: static DivergenceDescriptor EmptyDivergenceDesc; ModifiedPO CyclePO; const DominatorTreeT &DT; const CycleInfoT &CI; DenseMap> CachedControlDivDescs; }; /// \brief Analysis that identifies uniform values in a data-parallel /// execution. /// /// This analysis propagates divergence in a data-parallel context /// from sources of divergence to all users. It can be instantiated /// for an IR that provides a suitable SSAContext. template class GenericUniformityAnalysisImpl { public: using BlockT = typename ContextT::BlockT; using FunctionT = typename ContextT::FunctionT; using ValueRefT = typename ContextT::ValueRefT; using ConstValueRefT = typename ContextT::ConstValueRefT; using UseT = typename ContextT::UseT; using InstructionT = typename ContextT::InstructionT; using DominatorTreeT = typename ContextT::DominatorTreeT; using CycleInfoT = GenericCycleInfo; using CycleT = typename CycleInfoT::CycleT; using SyncDependenceAnalysisT = GenericSyncDependenceAnalysis; using DivergenceDescriptorT = typename SyncDependenceAnalysisT::DivergenceDescriptor; using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap; GenericUniformityAnalysisImpl(const DominatorTreeT &DT, const CycleInfoT &CI, const TargetTransformInfo *TTI) : Context(CI.getSSAContext()), F(*Context.getFunction()), CI(CI), TTI(TTI), DT(DT), SDA(Context, DT, CI) {} void initialize(); const FunctionT &getFunction() const { return F; } /// \brief Mark \p UniVal as a value that is always uniform. void addUniformOverride(const InstructionT &Instr); /// \brief Examine \p I for divergent outputs and add to the worklist. void markDivergent(const InstructionT &I); /// \brief Mark \p DivVal as a divergent value. /// \returns Whether the tracked divergence state of \p DivVal changed. bool markDivergent(ConstValueRefT DivVal); /// \brief Mark outputs of \p Instr as divergent. /// \returns Whether the tracked divergence state of any output has changed. bool markDefsDivergent(const InstructionT &Instr); /// \brief Propagate divergence to all instructions in the region. /// Divergence is seeded by calls to \p markDivergent. void compute(); /// \brief Whether any value was marked or analyzed to be divergent. bool hasDivergence() const { return !DivergentValues.empty(); } /// \brief Whether \p Val will always return a uniform value regardless of its /// operands bool isAlwaysUniform(const InstructionT &Instr) const; bool hasDivergentDefs(const InstructionT &I) const; bool isDivergent(const InstructionT &I) const { if (I.isTerminator()) { return DivergentTermBlocks.contains(I.getParent()); } return hasDivergentDefs(I); }; /// \brief Whether \p Val is divergent at its definition. bool isDivergent(ConstValueRefT V) const { return DivergentValues.count(V); } bool isDivergentUse(const UseT &U) const; bool hasDivergentTerminator(const BlockT &B) const { return DivergentTermBlocks.contains(&B); } void print(raw_ostream &out) const; protected: /// \brief Value/block pair representing a single phi input. struct PhiInput { ConstValueRefT value; BlockT *predBlock; PhiInput(ConstValueRefT value, BlockT *predBlock) : value(value), predBlock(predBlock) {} }; const ContextT &Context; const FunctionT &F; const CycleInfoT &CI; const TargetTransformInfo *TTI = nullptr; // Detected/marked divergent values. std::set DivergentValues; SmallPtrSet DivergentTermBlocks; // Internal worklist for divergence propagation. std::vector Worklist; /// \brief Mark \p Term as divergent and push all Instructions that become /// divergent as a result on the worklist. void analyzeControlDivergence(const InstructionT &Term); private: const DominatorTreeT &DT; // Recognized cycles with divergent exits. SmallPtrSet DivergentExitCycles; // Cycles assumed to be divergent. // // We don't use a set here because every insertion needs an explicit // traversal of all existing members. SmallVector AssumedDivergent; // The SDA links divergent branches to divergent control-flow joins. SyncDependenceAnalysisT SDA; // Set of known-uniform values. SmallPtrSet UniformOverrides; /// \brief Mark all nodes in \p JoinBlock as divergent and push them on /// the worklist. void taintAndPushAllDefs(const BlockT &JoinBlock); /// \brief Mark all phi nodes in \p JoinBlock as divergent and push them on /// the worklist. void taintAndPushPhiNodes(const BlockT &JoinBlock); /// \brief Identify all Instructions that become divergent because \p DivExit /// is a divergent cycle exit of \p DivCycle. Mark those instructions as /// divergent and push them on the worklist. void propagateCycleExitDivergence(const BlockT &DivExit, const CycleT &DivCycle); /// Mark as divergent all external uses of values defined in \p DefCycle. void analyzeCycleExitDivergence(const CycleT &DefCycle); /// \brief Mark as divergent all uses of \p I that are outside \p DefCycle. void propagateTemporalDivergence(const InstructionT &I, const CycleT &DefCycle); /// \brief Push all users of \p Val (in the region) to the worklist. void pushUsers(const InstructionT &I); void pushUsers(ConstValueRefT V); bool usesValueFromCycle(const InstructionT &I, const CycleT &DefCycle) const; /// \brief Whether \p Def is divergent when read in \p ObservingBlock. bool isTemporalDivergent(const BlockT &ObservingBlock, const InstructionT &Def) const; }; template void GenericUniformityAnalysisImplDeleter::operator()(ImplT *Impl) { delete Impl; } /// Compute divergence starting with a divergent branch. template class DivergencePropagator { public: using BlockT = typename ContextT::BlockT; using DominatorTreeT = typename ContextT::DominatorTreeT; using FunctionT = typename ContextT::FunctionT; using ValueRefT = typename ContextT::ValueRefT; using CycleInfoT = GenericCycleInfo; using CycleT = typename CycleInfoT::CycleT; using ModifiedPO = ModifiedPostOrder; using SyncDependenceAnalysisT = GenericSyncDependenceAnalysis; using DivergenceDescriptorT = typename SyncDependenceAnalysisT::DivergenceDescriptor; using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap; const ModifiedPO &CyclePOT; const DominatorTreeT &DT; const CycleInfoT &CI; const BlockT &DivTermBlock; const ContextT &Context; // Track blocks that receive a new label. Every time we relabel a // cycle header, we another pass over the modified post-order in // order to propagate the header label. The bit vector also allows // us to skip labels that have not changed. SparseBitVector<> FreshLabels; // divergent join and cycle exit descriptor. std::unique_ptr DivDesc; BlockLabelMapT &BlockLabels; DivergencePropagator(const ModifiedPO &CyclePOT, const DominatorTreeT &DT, const CycleInfoT &CI, const BlockT &DivTermBlock) : CyclePOT(CyclePOT), DT(DT), CI(CI), DivTermBlock(DivTermBlock), Context(CI.getSSAContext()), DivDesc(new DivergenceDescriptorT), BlockLabels(DivDesc->BlockLabels) {} void printDefs(raw_ostream &Out) { Out << "Propagator::BlockLabels {\n"; for (int BlockIdx = (int)CyclePOT.size() - 1; BlockIdx >= 0; --BlockIdx) { const auto *Block = CyclePOT[BlockIdx]; const auto *Label = BlockLabels[Block]; Out << Context.print(Block) << "(" << BlockIdx << ") : "; if (!Label) { Out << "\n"; } else { Out << Context.print(Label) << "\n"; } } Out << "}\n"; } // Push a definition (\p PushedLabel) to \p SuccBlock and return whether this // causes a divergent join. bool computeJoin(const BlockT &SuccBlock, const BlockT &PushedLabel) { const auto *OldLabel = BlockLabels[&SuccBlock]; LLVM_DEBUG(dbgs() << "labeling " << Context.print(&SuccBlock) << ":\n" << "\tpushed label: " << Context.print(&PushedLabel) << "\n" << "\told label: " << Context.print(OldLabel) << "\n"); // Early exit if there is no change in the label. if (OldLabel == &PushedLabel) return false; if (OldLabel != &SuccBlock) { auto SuccIdx = CyclePOT.getIndex(&SuccBlock); // Assigning a new label, mark this in FreshLabels. LLVM_DEBUG(dbgs() << "\tfresh label: " << SuccIdx << "\n"); FreshLabels.set(SuccIdx); } // This is not a join if the succ was previously unlabeled. if (!OldLabel) { LLVM_DEBUG(dbgs() << "\tnew label: " << Context.print(&PushedLabel) << "\n"); BlockLabels[&SuccBlock] = &PushedLabel; return false; } // This is a new join. Label the join block as itself, and not as // the pushed label. LLVM_DEBUG(dbgs() << "\tnew label: " << Context.print(&SuccBlock) << "\n"); BlockLabels[&SuccBlock] = &SuccBlock; return true; } // visiting a virtual cycle exit edge from the cycle header --> temporal // divergence on join bool visitCycleExitEdge(const BlockT &ExitBlock, const BlockT &Label) { if (!computeJoin(ExitBlock, Label)) return false; // Identified a divergent cycle exit DivDesc->CycleDivBlocks.insert(&ExitBlock); LLVM_DEBUG(dbgs() << "\tDivergent cycle exit: " << Context.print(&ExitBlock) << "\n"); return true; } // process \p SuccBlock with reaching definition \p Label bool visitEdge(const BlockT &SuccBlock, const BlockT &Label) { if (!computeJoin(SuccBlock, Label)) return false; // Divergent, disjoint paths join. DivDesc->JoinDivBlocks.insert(&SuccBlock); LLVM_DEBUG(dbgs() << "\tDivergent join: " << Context.print(&SuccBlock) << "\n"); return true; } std::unique_ptr computeJoinPoints() { assert(DivDesc); LLVM_DEBUG(dbgs() << "SDA:computeJoinPoints: " << Context.print(&DivTermBlock) << "\n"); // Early stopping criterion int FloorIdx = CyclePOT.size() - 1; const BlockT *FloorLabel = nullptr; int DivTermIdx = CyclePOT.getIndex(&DivTermBlock); // Bootstrap with branch targets auto const *DivTermCycle = CI.getCycle(&DivTermBlock); for (const auto *SuccBlock : successors(&DivTermBlock)) { if (DivTermCycle && !DivTermCycle->contains(SuccBlock)) { // If DivTerm exits the cycle immediately, computeJoin() might // not reach SuccBlock with a different label. We need to // check for this exit now. DivDesc->CycleDivBlocks.insert(SuccBlock); LLVM_DEBUG(dbgs() << "\tImmediate divergent cycle exit: " << Context.print(SuccBlock) << "\n"); } auto SuccIdx = CyclePOT.getIndex(SuccBlock); visitEdge(*SuccBlock, *SuccBlock); FloorIdx = std::min(FloorIdx, SuccIdx); } while (true) { auto BlockIdx = FreshLabels.find_last(); if (BlockIdx == -1 || BlockIdx < FloorIdx) break; LLVM_DEBUG(dbgs() << "Current labels:\n"; printDefs(dbgs())); FreshLabels.reset(BlockIdx); if (BlockIdx == DivTermIdx) { LLVM_DEBUG(dbgs() << "Skipping DivTermBlock\n"); continue; } const auto *Block = CyclePOT[BlockIdx]; LLVM_DEBUG(dbgs() << "visiting " << Context.print(Block) << " at index " << BlockIdx << "\n"); const auto *Label = BlockLabels[Block]; assert(Label); bool CausedJoin = false; int LoweredFloorIdx = FloorIdx; // If the current block is the header of a reducible cycle that // contains the divergent branch, then the label should be // propagated to the cycle exits. Such a header is the "last // possible join" of any disjoint paths within this cycle. This // prevents detection of spurious joins at the entries of any // irreducible child cycles. // // This conclusion about the header is true for any choice of DFS: // // If some DFS has a reducible cycle C with header H, then for // any other DFS, H is the header of a cycle C' that is a // superset of C. For a divergent branch inside the subgraph // C, any join node inside C is either H, or some node // encountered without passing through H. // auto getReducibleParent = [&](const BlockT *Block) -> const CycleT * { if (!CyclePOT.isReducibleCycleHeader(Block)) return nullptr; const auto *BlockCycle = CI.getCycle(Block); if (BlockCycle->contains(&DivTermBlock)) return BlockCycle; return nullptr; }; if (const auto *BlockCycle = getReducibleParent(Block)) { SmallVector BlockCycleExits; BlockCycle->getExitBlocks(BlockCycleExits); for (auto *BlockCycleExit : BlockCycleExits) { CausedJoin |= visitCycleExitEdge(*BlockCycleExit, *Label); LoweredFloorIdx = std::min(LoweredFloorIdx, CyclePOT.getIndex(BlockCycleExit)); } } else { for (const auto *SuccBlock : successors(Block)) { CausedJoin |= visitEdge(*SuccBlock, *Label); LoweredFloorIdx = std::min(LoweredFloorIdx, CyclePOT.getIndex(SuccBlock)); } } // Floor update if (CausedJoin) { // 1. Different labels pushed to successors FloorIdx = LoweredFloorIdx; } else if (FloorLabel != Label) { // 2. No join caused BUT we pushed a label that is different than the // last pushed label FloorIdx = LoweredFloorIdx; FloorLabel = Label; } } LLVM_DEBUG(dbgs() << "Final labeling:\n"; printDefs(dbgs())); // Check every cycle containing DivTermBlock for exit divergence. // A cycle has exit divergence if the label of an exit block does // not match the label of its header. for (const auto *Cycle = CI.getCycle(&DivTermBlock); Cycle; Cycle = Cycle->getParentCycle()) { if (Cycle->isReducible()) { // The exit divergence of a reducible cycle is recorded while // propagating labels. continue; } SmallVector Exits; Cycle->getExitBlocks(Exits); auto *Header = Cycle->getHeader(); auto *HeaderLabel = BlockLabels[Header]; for (const auto *Exit : Exits) { if (BlockLabels[Exit] != HeaderLabel) { // Identified a divergent cycle exit DivDesc->CycleDivBlocks.insert(Exit); LLVM_DEBUG(dbgs() << "\tDivergent cycle exit: " << Context.print(Exit) << "\n"); } } } return std::move(DivDesc); } }; template typename llvm::GenericSyncDependenceAnalysis::DivergenceDescriptor llvm::GenericSyncDependenceAnalysis::EmptyDivergenceDesc; template llvm::GenericSyncDependenceAnalysis::GenericSyncDependenceAnalysis( const ContextT &Context, const DominatorTreeT &DT, const CycleInfoT &CI) : CyclePO(Context), DT(DT), CI(CI) { CyclePO.compute(CI); } template auto llvm::GenericSyncDependenceAnalysis::getJoinBlocks( const BlockT *DivTermBlock) -> const DivergenceDescriptor & { // trivial case if (succ_size(DivTermBlock) <= 1) { return EmptyDivergenceDesc; } // already available in cache? auto ItCached = CachedControlDivDescs.find(DivTermBlock); if (ItCached != CachedControlDivDescs.end()) return *ItCached->second; // compute all join points DivergencePropagatorT Propagator(CyclePO, DT, CI, *DivTermBlock); auto DivDesc = Propagator.computeJoinPoints(); auto printBlockSet = [&](ConstBlockSet &Blocks) { return Printable([&](raw_ostream &Out) { Out << "["; ListSeparator LS; for (const auto *BB : Blocks) { Out << LS << CI.getSSAContext().print(BB); } Out << "]\n"; }); }; LLVM_DEBUG( dbgs() << "\nResult (" << CI.getSSAContext().print(DivTermBlock) << "):\n JoinDivBlocks: " << printBlockSet(DivDesc->JoinDivBlocks) << " CycleDivBlocks: " << printBlockSet(DivDesc->CycleDivBlocks) << "\n"); (void)printBlockSet; auto ItInserted = CachedControlDivDescs.try_emplace(DivTermBlock, std::move(DivDesc)); assert(ItInserted.second); return *ItInserted.first->second; } template void GenericUniformityAnalysisImpl::markDivergent( const InstructionT &I) { if (isAlwaysUniform(I)) return; bool Marked = false; if (I.isTerminator()) { Marked = DivergentTermBlocks.insert(I.getParent()).second; if (Marked) { LLVM_DEBUG(dbgs() << "marked divergent term block: " << Context.print(I.getParent()) << "\n"); } } else { Marked = markDefsDivergent(I); } if (Marked) Worklist.push_back(&I); } template bool GenericUniformityAnalysisImpl::markDivergent( ConstValueRefT Val) { if (DivergentValues.insert(Val).second) { LLVM_DEBUG(dbgs() << "marked divergent: " << Context.print(Val) << "\n"); return true; } return false; } template void GenericUniformityAnalysisImpl::addUniformOverride( const InstructionT &Instr) { UniformOverrides.insert(&Instr); } // Mark as divergent all external uses of values defined in \p DefCycle. // // A value V defined by a block B inside \p DefCycle may be used outside the // cycle only if the use is a PHI in some exit block, or B dominates some exit // block. Thus, we check uses as follows: // // - Check all PHIs in all exit blocks for inputs defined inside \p DefCycle. // - For every block B inside \p DefCycle that dominates at least one exit // block, check all uses outside \p DefCycle. // // FIXME: This function does not distinguish between divergent and uniform // exits. For each divergent exit, only the values that are live at that exit // need to be propagated as divergent at their use outside the cycle. template void GenericUniformityAnalysisImpl::analyzeCycleExitDivergence( const CycleT &DefCycle) { SmallVector Exits; DefCycle.getExitBlocks(Exits); for (auto *Exit : Exits) { for (auto &Phi : Exit->phis()) { if (usesValueFromCycle(Phi, DefCycle)) { markDivergent(Phi); } } } for (auto *BB : DefCycle.blocks()) { if (!llvm::any_of(Exits, [&](BlockT *Exit) { return DT.dominates(BB, Exit); })) continue; for (auto &II : *BB) { propagateTemporalDivergence(II, DefCycle); } } } template void GenericUniformityAnalysisImpl::propagateCycleExitDivergence( const BlockT &DivExit, const CycleT &InnerDivCycle) { LLVM_DEBUG(dbgs() << "\tpropCycleExitDiv " << Context.print(&DivExit) << "\n"); auto *DivCycle = &InnerDivCycle; auto *OuterDivCycle = DivCycle; auto *ExitLevelCycle = CI.getCycle(&DivExit); const unsigned CycleExitDepth = ExitLevelCycle ? ExitLevelCycle->getDepth() : 0; // Find outer-most cycle that does not contain \p DivExit while (DivCycle && DivCycle->getDepth() > CycleExitDepth) { LLVM_DEBUG(dbgs() << " Found exiting cycle: " << Context.print(DivCycle->getHeader()) << "\n"); OuterDivCycle = DivCycle; DivCycle = DivCycle->getParentCycle(); } LLVM_DEBUG(dbgs() << "\tOuter-most exiting cycle: " << Context.print(OuterDivCycle->getHeader()) << "\n"); if (!DivergentExitCycles.insert(OuterDivCycle).second) return; // Exit divergence does not matter if the cycle itself is assumed to // be divergent. for (const auto *C : AssumedDivergent) { if (C->contains(OuterDivCycle)) return; } analyzeCycleExitDivergence(*OuterDivCycle); } template void GenericUniformityAnalysisImpl::taintAndPushAllDefs( const BlockT &BB) { LLVM_DEBUG(dbgs() << "taintAndPushAllDefs " << Context.print(&BB) << "\n"); for (const auto &I : instrs(BB)) { // Terminators do not produce values; they are divergent only if // the condition is divergent. That is handled when the divergent // condition is placed in the worklist. if (I.isTerminator()) break; markDivergent(I); } } /// Mark divergent phi nodes in a join block template void GenericUniformityAnalysisImpl::taintAndPushPhiNodes( const BlockT &JoinBlock) { LLVM_DEBUG(dbgs() << "taintAndPushPhiNodes in " << Context.print(&JoinBlock) << "\n"); for (const auto &Phi : JoinBlock.phis()) { // FIXME: The non-undef value is not constant per se; it just happens to be // uniform and may not dominate this PHI. So assuming that the same value // reaches along all incoming edges may itself be undefined behaviour. This // particular interpretation of the undef value was added to // DivergenceAnalysis in the following review: // // https://reviews.llvm.org/D19013 if (ContextT::isConstantOrUndefValuePhi(Phi)) continue; markDivergent(Phi); } } /// Add \p Candidate to \p Cycles if it is not already contained in \p Cycles. /// /// \return true iff \p Candidate was added to \p Cycles. template static bool insertIfNotContained(SmallVector &Cycles, CycleT *Candidate) { if (llvm::any_of(Cycles, [Candidate](CycleT *C) { return C->contains(Candidate); })) return false; Cycles.push_back(Candidate); return true; } /// Return the outermost cycle made divergent by branch outside it. /// /// If two paths that diverged outside an irreducible cycle join /// inside that cycle, then that whole cycle is assumed to be /// divergent. This does not apply if the cycle is reducible. template static const CycleT *getExtDivCycle(const CycleT *Cycle, const BlockT *DivTermBlock, const BlockT *JoinBlock) { assert(Cycle); assert(Cycle->contains(JoinBlock)); if (Cycle->contains(DivTermBlock)) return nullptr; const auto *OriginalCycle = Cycle; const auto *Parent = Cycle->getParentCycle(); while (Parent && !Parent->contains(DivTermBlock)) { Cycle = Parent; Parent = Cycle->getParentCycle(); } // If the original cycle is not the outermost cycle, then the outermost cycle // is irreducible. If the outermost cycle were reducible, then external // diverged paths would not reach the original inner cycle. (void)OriginalCycle; assert(Cycle == OriginalCycle || !Cycle->isReducible()); if (Cycle->isReducible()) { assert(Cycle->getHeader() == JoinBlock); return nullptr; } LLVM_DEBUG(dbgs() << "cycle made divergent by external branch\n"); return Cycle; } /// Return the outermost cycle made divergent by branch inside it. /// /// This checks the "diverged entry" criterion defined in the /// docs/ConvergenceAnalysis.html. template static const CycleT * getIntDivCycle(const CycleT *Cycle, const BlockT *DivTermBlock, const BlockT *JoinBlock, const DominatorTreeT &DT, ContextT &Context) { LLVM_DEBUG(dbgs() << "examine join " << Context.print(JoinBlock) << " for internal branch " << Context.print(DivTermBlock) << "\n"); if (DT.properlyDominates(DivTermBlock, JoinBlock)) return nullptr; // Find the smallest common cycle, if one exists. assert(Cycle && Cycle->contains(JoinBlock)); while (Cycle && !Cycle->contains(DivTermBlock)) { Cycle = Cycle->getParentCycle(); } if (!Cycle || Cycle->isReducible()) return nullptr; if (DT.properlyDominates(Cycle->getHeader(), JoinBlock)) return nullptr; LLVM_DEBUG(dbgs() << " header " << Context.print(Cycle->getHeader()) << " does not dominate join\n"); const auto *Parent = Cycle->getParentCycle(); while (Parent && !DT.properlyDominates(Parent->getHeader(), JoinBlock)) { LLVM_DEBUG(dbgs() << " header " << Context.print(Parent->getHeader()) << " does not dominate join\n"); Cycle = Parent; Parent = Parent->getParentCycle(); } LLVM_DEBUG(dbgs() << " cycle made divergent by internal branch\n"); return Cycle; } template static const CycleT * getOutermostDivergentCycle(const CycleT *Cycle, const BlockT *DivTermBlock, const BlockT *JoinBlock, const DominatorTreeT &DT, ContextT &Context) { if (!Cycle) return nullptr; // First try to expand Cycle to the largest that contains JoinBlock // but not DivTermBlock. const auto *Ext = getExtDivCycle(Cycle, DivTermBlock, JoinBlock); // Continue expanding to the largest cycle that contains both. const auto *Int = getIntDivCycle(Cycle, DivTermBlock, JoinBlock, DT, Context); if (Int) return Int; return Ext; } template bool GenericUniformityAnalysisImpl::isTemporalDivergent( const BlockT &ObservingBlock, const InstructionT &Def) const { const BlockT *DefBlock = Def.getParent(); for (const CycleT *Cycle = CI.getCycle(DefBlock); Cycle && !Cycle->contains(&ObservingBlock); Cycle = Cycle->getParentCycle()) { if (DivergentExitCycles.contains(Cycle)) { return true; } } return false; } template void GenericUniformityAnalysisImpl::analyzeControlDivergence( const InstructionT &Term) { const auto *DivTermBlock = Term.getParent(); DivergentTermBlocks.insert(DivTermBlock); LLVM_DEBUG(dbgs() << "analyzeControlDiv " << Context.print(DivTermBlock) << "\n"); // Don't propagate divergence from unreachable blocks. if (!DT.isReachableFromEntry(DivTermBlock)) return; const auto &DivDesc = SDA.getJoinBlocks(DivTermBlock); SmallVector DivCycles; // Iterate over all blocks now reachable by a disjoint path join for (const auto *JoinBlock : DivDesc.JoinDivBlocks) { const auto *Cycle = CI.getCycle(JoinBlock); LLVM_DEBUG(dbgs() << "visiting join block " << Context.print(JoinBlock) << "\n"); if (const auto *Outermost = getOutermostDivergentCycle( Cycle, DivTermBlock, JoinBlock, DT, Context)) { LLVM_DEBUG(dbgs() << "found divergent cycle\n"); DivCycles.push_back(Outermost); continue; } taintAndPushPhiNodes(*JoinBlock); } // Sort by order of decreasing depth. This allows later cycles to be skipped // because they are already contained in earlier ones. llvm::sort(DivCycles, [](const CycleT *A, const CycleT *B) { return A->getDepth() > B->getDepth(); }); // Cycles that are assumed divergent due to the diverged entry // criterion potentially contain temporal divergence depending on // the DFS chosen. Conservatively, all values produced in such a // cycle are assumed divergent. "Cycle invariant" values may be // assumed uniform, but that requires further analysis. for (auto *C : DivCycles) { if (!insertIfNotContained(AssumedDivergent, C)) continue; LLVM_DEBUG(dbgs() << "process divergent cycle\n"); for (const BlockT *BB : C->blocks()) { taintAndPushAllDefs(*BB); } } const auto *BranchCycle = CI.getCycle(DivTermBlock); assert(DivDesc.CycleDivBlocks.empty() || BranchCycle); for (const auto *DivExitBlock : DivDesc.CycleDivBlocks) { propagateCycleExitDivergence(*DivExitBlock, *BranchCycle); } } template void GenericUniformityAnalysisImpl::compute() { // Initialize worklist. auto DivValuesCopy = DivergentValues; for (const auto DivVal : DivValuesCopy) { assert(isDivergent(DivVal) && "Worklist invariant violated!"); pushUsers(DivVal); } // All values on the Worklist are divergent. // Their users may not have been updated yet. while (!Worklist.empty()) { const InstructionT *I = Worklist.back(); Worklist.pop_back(); LLVM_DEBUG(dbgs() << "worklist pop: " << Context.print(I) << "\n"); if (I->isTerminator()) { analyzeControlDivergence(*I); continue; } // propagate value divergence to users assert(isDivergent(*I) && "Worklist invariant violated!"); pushUsers(*I); } } template bool GenericUniformityAnalysisImpl::isAlwaysUniform( const InstructionT &Instr) const { return UniformOverrides.contains(&Instr); } template GenericUniformityInfo::GenericUniformityInfo( const DominatorTreeT &DT, const CycleInfoT &CI, const TargetTransformInfo *TTI) { DA.reset(new ImplT{DT, CI, TTI}); } template void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { bool haveDivergentArgs = false; // Control flow instructions may be divergent even if their inputs are // uniform. Thus, although exceedingly rare, it is possible to have a program // with no divergent values but with divergent control structures. if (DivergentValues.empty() && DivergentTermBlocks.empty() && DivergentExitCycles.empty()) { OS << "ALL VALUES UNIFORM\n"; return; } for (const auto &entry : DivergentValues) { const BlockT *parent = Context.getDefBlock(entry); if (!parent) { if (!haveDivergentArgs) { OS << "DIVERGENT ARGUMENTS:\n"; haveDivergentArgs = true; } OS << " DIVERGENT: " << Context.print(entry) << '\n'; } } if (!AssumedDivergent.empty()) { OS << "CYCLES ASSSUMED DIVERGENT:\n"; for (const CycleT *cycle : AssumedDivergent) { OS << " " << cycle->print(Context) << '\n'; } } if (!DivergentExitCycles.empty()) { OS << "CYCLES WITH DIVERGENT EXIT:\n"; for (const CycleT *cycle : DivergentExitCycles) { OS << " " << cycle->print(Context) << '\n'; } } for (auto &block : F) { OS << "\nBLOCK " << Context.print(&block) << '\n'; OS << "DEFINITIONS\n"; SmallVector defs; Context.appendBlockDefs(defs, block); for (auto value : defs) { if (isDivergent(value)) OS << " DIVERGENT: "; else OS << " "; OS << Context.print(value) << '\n'; } OS << "TERMINATORS\n"; SmallVector terms; Context.appendBlockTerms(terms, block); bool divergentTerminators = hasDivergentTerminator(block); for (auto *T : terms) { if (divergentTerminators) OS << " DIVERGENT: "; else OS << " "; OS << Context.print(T) << '\n'; } OS << "END BLOCK\n"; } } template bool GenericUniformityInfo::hasDivergence() const { return DA->hasDivergence(); } template const typename ContextT::FunctionT & GenericUniformityInfo::getFunction() const { return DA->getFunction(); } /// Whether \p V is divergent at its definition. template bool GenericUniformityInfo::isDivergent(ConstValueRefT V) const { return DA->isDivergent(V); } template bool GenericUniformityInfo::isDivergent(const InstructionT *I) const { return DA->isDivergent(*I); } template bool GenericUniformityInfo::isDivergentUse(const UseT &U) const { return DA->isDivergentUse(U); } template bool GenericUniformityInfo::hasDivergentTerminator(const BlockT &B) { return DA->hasDivergentTerminator(B); } /// \brief T helper function for printing. template void GenericUniformityInfo::print(raw_ostream &out) const { DA->print(out); } template void llvm::ModifiedPostOrder::computeStackPO( SmallVectorImpl &Stack, const CycleInfoT &CI, const CycleT *Cycle, SmallPtrSetImpl &Finalized) { LLVM_DEBUG(dbgs() << "inside computeStackPO\n"); while (!Stack.empty()) { auto *NextBB = Stack.back(); if (Finalized.count(NextBB)) { Stack.pop_back(); continue; } LLVM_DEBUG(dbgs() << " visiting " << CI.getSSAContext().print(NextBB) << "\n"); auto *NestedCycle = CI.getCycle(NextBB); if (Cycle != NestedCycle && (!Cycle || Cycle->contains(NestedCycle))) { LLVM_DEBUG(dbgs() << " found a cycle\n"); while (NestedCycle->getParentCycle() != Cycle) NestedCycle = NestedCycle->getParentCycle(); SmallVector NestedExits; NestedCycle->getExitBlocks(NestedExits); bool PushedNodes = false; for (auto *NestedExitBB : NestedExits) { LLVM_DEBUG(dbgs() << " examine exit: " << CI.getSSAContext().print(NestedExitBB) << "\n"); if (Cycle && !Cycle->contains(NestedExitBB)) continue; if (Finalized.count(NestedExitBB)) continue; PushedNodes = true; Stack.push_back(NestedExitBB); LLVM_DEBUG(dbgs() << " pushed exit: " << CI.getSSAContext().print(NestedExitBB) << "\n"); } if (!PushedNodes) { // All loop exits finalized -> finish this node Stack.pop_back(); computeCyclePO(CI, NestedCycle, Finalized); } continue; } LLVM_DEBUG(dbgs() << " no nested cycle, going into DAG\n"); // DAG-style bool PushedNodes = false; for (auto *SuccBB : successors(NextBB)) { LLVM_DEBUG(dbgs() << " examine succ: " << CI.getSSAContext().print(SuccBB) << "\n"); if (Cycle && !Cycle->contains(SuccBB)) continue; if (Finalized.count(SuccBB)) continue; PushedNodes = true; Stack.push_back(SuccBB); LLVM_DEBUG(dbgs() << " pushed succ: " << CI.getSSAContext().print(SuccBB) << "\n"); } if (!PushedNodes) { // Never push nodes twice LLVM_DEBUG(dbgs() << " finishing node: " << CI.getSSAContext().print(NextBB) << "\n"); Stack.pop_back(); Finalized.insert(NextBB); appendBlock(*NextBB); } } LLVM_DEBUG(dbgs() << "exited computeStackPO\n"); } template void ModifiedPostOrder::computeCyclePO( const CycleInfoT &CI, const CycleT *Cycle, SmallPtrSetImpl &Finalized) { LLVM_DEBUG(dbgs() << "inside computeCyclePO\n"); SmallVector Stack; auto *CycleHeader = Cycle->getHeader(); LLVM_DEBUG(dbgs() << " noted header: " << CI.getSSAContext().print(CycleHeader) << "\n"); assert(!Finalized.count(CycleHeader)); Finalized.insert(CycleHeader); // Visit the header last LLVM_DEBUG(dbgs() << " finishing header: " << CI.getSSAContext().print(CycleHeader) << "\n"); appendBlock(*CycleHeader, Cycle->isReducible()); // Initialize with immediate successors for (auto *BB : successors(CycleHeader)) { LLVM_DEBUG(dbgs() << " examine succ: " << CI.getSSAContext().print(BB) << "\n"); if (!Cycle->contains(BB)) continue; if (BB == CycleHeader) continue; if (!Finalized.count(BB)) { LLVM_DEBUG(dbgs() << " pushed succ: " << CI.getSSAContext().print(BB) << "\n"); Stack.push_back(BB); } } // Compute PO inside region computeStackPO(Stack, CI, Cycle, Finalized); LLVM_DEBUG(dbgs() << "exited computeCyclePO\n"); } /// \brief Generically compute the modified post order. template void llvm::ModifiedPostOrder::compute(const CycleInfoT &CI) { SmallPtrSet Finalized; SmallVector Stack; auto *F = CI.getFunction(); Stack.reserve(24); // FIXME made-up number Stack.push_back(&F->front()); computeStackPO(Stack, CI, nullptr, Finalized); } } // namespace llvm #undef DEBUG_TYPE #endif // LLVM_ADT_GENERICUNIFORMITYIMPL_H