//===- llvm/ADT/SuffixTreeNode.h - Nodes for SuffixTrees --------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file defines nodes for use within a SuffixTree. // // Each node has either no children or at least two children, with the root // being a exception in the empty tree. // // Children are represented as a map between unsigned integers and nodes. If // a node N has a child M on unsigned integer k, then the mapping represented // by N is a proper prefix of the mapping represented by M. Note that this, // although similar to a trie is somewhat different: each node stores a full // substring of the full mapping rather than a single character state. // // Each internal node contains a pointer to the internal node representing // the same string, but with the first character chopped off. This is stored // in \p Link. Each leaf node stores the start index of its respective // suffix in \p SuffixIdx. //===----------------------------------------------------------------------===// #ifndef LLVM_SUPPORT_SUFFIXTREE_NODE_H #define LLVM_SUPPORT_SUFFIXTREE_NODE_H #include "llvm/ADT/DenseMap.h" namespace llvm { /// A node in a suffix tree which represents a substring or suffix. struct SuffixTreeNode { public: /// Represents an undefined index in the suffix tree. static const unsigned EmptyIdx = -1; enum class NodeKind { ST_Leaf, ST_Internal }; private: const NodeKind Kind; /// The start index of this node's substring in the main string. unsigned StartIdx = EmptyIdx; /// The length of the string formed by concatenating the edge labels from /// the root to this node. unsigned ConcatLen = 0; public: // LLVM RTTI boilerplate. NodeKind getKind() const { return Kind; } /// \return the start index of this node's substring in the entire string. unsigned getStartIdx() const; /// \returns the end index of this node. virtual unsigned getEndIdx() const = 0; /// Advance this node's StartIdx by \p Inc. void incrementStartIdx(unsigned Inc); /// Set the length of the string from the root to this node to \p Len. void setConcatLen(unsigned Len); /// \returns the length of the string from the root to this node. unsigned getConcatLen() const; SuffixTreeNode(NodeKind Kind, unsigned StartIdx) : Kind(Kind), StartIdx(StartIdx) {} virtual ~SuffixTreeNode() = default; }; // A node with two or more children, or the root. struct SuffixTreeInternalNode : SuffixTreeNode { private: /// The end index of this node's substring in the main string. /// /// Every leaf node must have its \p EndIdx incremented at the end of every /// step in the construction algorithm. To avoid having to update O(N) /// nodes individually at the end of every step, the end index is stored /// as a pointer. unsigned EndIdx = EmptyIdx; /// A pointer to the internal node representing the same sequence with the /// first character chopped off. /// /// This acts as a shortcut in Ukkonen's algorithm. One of the things that /// Ukkonen's algorithm does to achieve linear-time construction is /// keep track of which node the next insert should be at. This makes each /// insert O(1), and there are a total of O(N) inserts. The suffix link /// helps with inserting children of internal nodes. /// /// Say we add a child to an internal node with associated mapping S. The /// next insertion must be at the node representing S - its first character. /// This is given by the way that we iteratively build the tree in Ukkonen's /// algorithm. The main idea is to look at the suffixes of each prefix in the /// string, starting with the longest suffix of the prefix, and ending with /// the shortest. Therefore, if we keep pointers between such nodes, we can /// move to the next insertion point in O(1) time. If we don't, then we'd /// have to query from the root, which takes O(N) time. This would make the /// construction algorithm O(N^2) rather than O(N). SuffixTreeInternalNode *Link = nullptr; public: // LLVM RTTI boilerplate. static bool classof(const SuffixTreeNode *N) { return N->getKind() == NodeKind::ST_Internal; } /// \returns true if this node is the root of its owning \p SuffixTree. bool isRoot() const; /// \returns the end index of this node's substring in the entire string. unsigned getEndIdx() const override; /// Sets \p Link to \p L. Assumes \p L is not null. void setLink(SuffixTreeInternalNode *L); /// \returns the pointer to the Link node. SuffixTreeInternalNode *getLink() const; /// The children of this node. /// /// A child existing on an unsigned integer implies that from the mapping /// represented by the current node, there is a way to reach another /// mapping by tacking that character on the end of the current string. DenseMap Children; SuffixTreeInternalNode(unsigned StartIdx, unsigned EndIdx, SuffixTreeInternalNode *Link) : SuffixTreeNode(NodeKind::ST_Internal, StartIdx), EndIdx(EndIdx), Link(Link) {} virtual ~SuffixTreeInternalNode() = default; }; // A node representing a suffix. struct SuffixTreeLeafNode : SuffixTreeNode { private: /// The start index of the suffix represented by this leaf. unsigned SuffixIdx = EmptyIdx; /// The end index of this node's substring in the main string. /// /// Every leaf node must have its \p EndIdx incremented at the end of every /// step in the construction algorithm. To avoid having to update O(N) /// nodes individually at the end of every step, the end index is stored /// as a pointer. unsigned *EndIdx = nullptr; public: // LLVM RTTI boilerplate. static bool classof(const SuffixTreeNode *N) { return N->getKind() == NodeKind::ST_Leaf; } /// \returns the end index of this node's substring in the entire string. unsigned getEndIdx() const override; /// \returns the start index of the suffix represented by this leaf. unsigned getSuffixIdx() const; /// Sets the start index of the suffix represented by this leaf to \p Idx. void setSuffixIdx(unsigned Idx); SuffixTreeLeafNode(unsigned StartIdx, unsigned *EndIdx) : SuffixTreeNode(NodeKind::ST_Leaf, StartIdx), EndIdx(EndIdx) {} virtual ~SuffixTreeLeafNode() = default; }; } // namespace llvm #endif // LLVM_SUPPORT_SUFFIXTREE_NODE_H