// Unicode utilities -*- C++ -*-
// Copyright The GNU Toolchain Authors.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.
// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.
/** @file include/bits/unicode.h
* This is an internal header file, included by other library headers.
* Do not attempt to use it directly. @headername{format}
*/
#ifndef _GLIBCXX_UNICODE_H
#define _GLIBCXX_UNICODE_H 1
#if __cplusplus >= 202002L
#include
#include // bit_width
#include // __detail::__from_chars_alnum_to_val_table
#include
#include
#include
#include
#include // iterator_t, sentinel_t, input_range, etc.
#include // view_interface
namespace std _GLIBCXX_VISIBILITY(default)
{
_GLIBCXX_BEGIN_NAMESPACE_VERSION
namespace __unicode
{
// Whether __c is a Unicode scalar value: a code point no greater than
// 0x10FFFF that lies outside the surrogate block [0xD800, 0xDFFF].
constexpr bool
__is_scalar_value(char32_t __c)
{
  // Everything below the surrogate block is a scalar value.
  if (__c < 0xD800) [[likely]]
    return true;
  // High and low surrogates are code points but not scalar values.
  if (__c <= 0xDFFF)
    return false;
  // Above the surrogates, only values up to U+10FFFF are code points.
  return __c <= 0x10FFFF;
}
// A code point that can be encoded in a single code unit of type _CharT.
template
constexpr bool
__is_single_code_unit(char32_t __c)
{
if constexpr (__gnu_cxx::__int_traits<_CharT>::__max <= 0xFF)
return __c < 0x7F; // ASCII character
else
return __c < __gnu_cxx::__int_traits<_CharT>::__max
&& __is_scalar_value(__c);
}
// Based on https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2023/p2728r6.html#add-the-transcoding-iterator-template
// Default error handler for _Utf_iterator: a stateless function object
// that yields U+FFFD (REPLACEMENT CHARACTER).
struct _Repl
{
  constexpr char32_t
  operator()() const noexcept
  {
    constexpr char32_t __replacement_character = U'\uFFFD';
    return __replacement_character;
  }
};
// Sentinel type that compares equal to an iterator when the element the
// iterator refers to equals a value-initialized iter_value_t (for example
// the terminating null character of a C-style string).
struct _Null_sentinel_t
{
template
requires default_initializable>
&& equality_comparable_with, iter_value_t<_It>>
friend constexpr auto
operator==(_It __it, _Null_sentinel_t)
// The comparison dereferences __it, so __it must be dereferenceable.
{ return *__it == iter_value_t<_It>{}; }
};
template _Sent = _Iter,
typename _ErrorHandler = _Repl>
requires convertible_to, _FromFmt>
class _Utf_iterator
{
static_assert(forward_iterator<_Iter> || noexcept(_ErrorHandler()()));
public:
using value_type = _ToFmt;
using difference_type = iter_difference_t<_Iter>;
using reference = value_type;
using iterator_concept
= std::__detail::__clamp_iter_cat<__iter_category_t<_Iter>,
bidirectional_iterator_tag>;
constexpr _Utf_iterator() = default;
// Construct from the start of the underlying range (__first), the current
// position within it (__it) and the end of the range (__last).
// Only available when _Iter is a bidirectional iterator.
constexpr
_Utf_iterator(_Iter __first, _Iter __it, _Sent __last)
requires bidirectional_iterator<_Iter>
: _M_first_and_curr{__first, __it}, _M_last(__last)
{
// Decode the code point at the current position, unless there is no
// input left, in which case the output buffer is just zeroed.
if (_M_curr() != _M_last)
_M_read();
else
_M_buf = {};
}
// Construct from the current position (__it) and the end of the range
// (__last). Used when _Iter is not bidirectional, so the start of the
// range is not stored.
constexpr
_Utf_iterator(_Iter __it, _Sent __last)
requires (!bidirectional_iterator<_Iter>)
: _M_first_and_curr{__it}, _M_last(__last)
{
// Decode the code point at the current position, unless there is no
// input left, in which case the output buffer is just zeroed.
if (_M_curr() != _M_last)
_M_read();
else
_M_buf = {};
}
template
requires convertible_to<_Iter2, _Iter> && convertible_to<_Sent2, _Sent>
constexpr
_Utf_iterator(const _Utf_iterator<_FromFmt, _ToFmt, _Iter2, _Sent2,
_ErrorHandler>& __other)
: _M_buf(__other._M_buf), _M_first_and_curr(__other._M_first_and_curr),
_M_buf_index(__other._M_buf_index), _M_buf_last(__other._M_buf_last),
_M_last(__other._M_last)
{ }
[[nodiscard]]
constexpr _Iter
begin() const requires bidirectional_iterator<_Iter>
{ return _M_first(); }
[[nodiscard]]
constexpr _Sent
end() const { return _M_last; }
[[nodiscard]]
constexpr _Iter
base() const requires forward_iterator<_Iter>
{ return _M_curr(); }
[[nodiscard]]
constexpr value_type
operator*() const { return _M_buf[_M_buf_index]; }
// Advance to the next output code unit.
//
// While code units of the current code point remain in _M_buf, this just
// steps through the buffer. On the last buffered code unit it decodes the
// next code point from the input, if there is any input left.
//
// For iterators that are not forward iterators, _M_read() has already
// moved _M_curr() past the code point it decoded, so after the final code
// point _M_curr() == _M_last while the buffer is still being consumed.
// Stepping off the last buffered code unit must then put the iterator
// into the "buffer fully consumed" state (_M_buf_index == _M_buf_last),
// because that is the state the comparison with the sentinel tests for.
constexpr _Utf_iterator&
operator++()
{
  if (_M_buf_index + 1 == _M_buf_last && _M_curr() != _M_last)
    {
      // A forward iterator was restored to the start of the code point
      // after decoding it, so skip over those code units now.
      if constexpr (forward_iterator<_Iter>)
	std::advance(_M_curr(), _M_to_increment);
      if (_M_curr() == _M_last)
	_M_buf_index = 0;
      else
	_M_read();
    }
  else if (_M_buf_index + 1 < _M_buf_last)
    ++_M_buf_index; // More code units of this code point remain.
  else if constexpr (!forward_iterator<_Iter>)
    // Input exhausted and the last buffered code unit has been consumed:
    // become equal to the sentinel.
    _M_buf_index = _M_buf_last;
  return *this;
}
constexpr _Utf_iterator
operator++(int)
{
auto __tmp = *this;
++*this;
return __tmp;
}
// Step back to the previous output code unit.
// When already at the first buffered code unit and not at the start of
// the underlying range, the previous code point has to be decoded.
// NOTE(review): that path calls _M_read_reverse(), which is only declared
// (marked TODO) in this file.
constexpr _Utf_iterator&
operator--() requires bidirectional_iterator<_Iter>
{
if (!_M_buf_index && _M_curr() != _M_first())
_M_read_reverse();
else if (_M_buf_index)
--_M_buf_index;
return *this;
}
// Post-decrement: step back one output code unit and return a copy of
// the iterator as it was before the step.
// Constrained like the prefix form, which it is implemented in terms of,
// so that it does not take part in overload resolution for iterators
// that cannot be decremented.
constexpr _Utf_iterator
operator--(int) requires bidirectional_iterator<_Iter>
{
  auto __tmp = *this;
  --*this;
  return __tmp;
}
// Compare two transcoding iterators.
// For forward iterators they are equal when they refer to the same input
// position and the same offset into the output buffer.
// Otherwise the input positions must compare equal and either the buffer
// offsets and lengths match, or both buffers are fully consumed.
[[nodiscard]]
friend constexpr bool
operator==(_Utf_iterator __lhs, _Utf_iterator __rhs)
requires forward_iterator<_Iter> || requires (_Iter __i) { __i != __i; }
{
if constexpr (forward_iterator<_Iter>)
return __lhs._M_curr() == __rhs._M_curr()
&& __lhs._M_buf_index == __rhs._M_buf_index;
else if (__lhs._M_curr() != __rhs._M_curr())
return false;
else if (__lhs._M_buf_index == __rhs._M_buf_index
&& __lhs._M_buf_last == __rhs._M_buf_last)
return true;
else
// Differing buffer states only compare equal when both iterators have
// consumed their whole buffer.
return __lhs._M_buf_index == __lhs._M_buf_last
&& __rhs._M_buf_index == __rhs._M_buf_last;
}
// Compare a transcoding iterator with the end of the underlying range.
// For forward iterators only the input position is compared. For other
// iterators the output buffer must also have been fully consumed
// (_M_buf_index == _M_buf_last).
[[nodiscard]]
friend constexpr bool
operator==(_Utf_iterator __lhs, _Sent __rhs)
{
if constexpr (forward_iterator<_Iter>)
return __lhs._M_curr() == __rhs;
else
return __lhs._M_curr() == __rhs
&& __lhs._M_buf_index == __lhs._M_buf_last;
}
private:
// Decode the next code point from the input, choosing the decoder by the
// size of the input code unit type (1, 2 or 4 bytes).
constexpr void
_M_read()
{
  constexpr auto __unit_size = sizeof(_FromFmt);
  static_assert(__unit_size == sizeof(uint8_t)
		  || __unit_size == sizeof(uint16_t)
		  || __unit_size == sizeof(uint32_t));
  if constexpr (__unit_size == sizeof(uint32_t))
    _M_read_utf32();
  else if constexpr (__unit_size == sizeof(uint16_t))
    _M_read_utf16();
  else
    _M_read_utf8();
}
constexpr void
_M_read_reverse(); // TODO
// Helper object created at the start of each _M_read_utf* function.
// The primary template does nothing.
template
struct _Guard
{
_Guard(void*, _Iter&) { }
};
// For forward iterators the destructor assigns the saved iterator
// (_M_orig) back to the current position of the owning _Utf_iterator,
// undoing any advancing done while decoding.
template requires forward_iterator<_It>
struct _Guard<_It>
{
constexpr ~_Guard() { _M_this->_M_curr() = std::move(_M_orig); }
_Utf_iterator* _M_this;
_It _M_orig;
};
// Decode one code point from UTF-8 input starting at _M_curr() and pass it
// to _M_update() together with the number of code units consumed.
// Any sequence that fails the checks below produces the result of
// _S_error() instead, and only the code units accepted so far are counted
// as consumed.
// The narrower second-byte ranges used for lead bytes 0xE0, 0xED, 0xF0 and
// 0xF4 correspond to the table of well-formed UTF-8 byte sequences in the
// Unicode Standard.
constexpr void
_M_read_utf8()
{
_Guard<_Iter> __g{this, _M_curr()};
char32_t __c{};
// Default range of a continuation byte.
const uint8_t __lo_bound = 0x80, __hi_bound = 0xBF;
// Read the lead byte and move past it.
uint8_t __u = *_M_curr()++;
// Number of code units consumed so far.
uint8_t __to_incr = 1;
// Accept the code unit at _M_curr(): count it and move to the next one.
auto __incr = [&, this] {
++__to_incr;
return ++_M_curr();
};
if (__u <= 0x7F) [[likely]] // 0x00 to 0x7F
__c = __u;
// 0x80 to 0xC1 is not accepted as a lead byte.
else if (__u < 0xC2) [[unlikely]]
__c = _S_error();
// Every remaining lead byte needs at least one more byte of input.
else if (_M_curr() == _M_last) [[unlikely]]
__c = _S_error();
else if (__u <= 0xDF) // 0xC2 to 0xDF
{
// Two-byte sequence: 5 payload bits in the lead byte.
__c = __u & 0x1F;
__u = *_M_curr();
if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
__c = _S_error();
else
{
__c = (__c << 6) | (__u & 0x3F);
__incr();
}
}
else if (__u <= 0xEF) // 0xE0 to 0xEF
{
// Three-byte sequence: 4 payload bits in the lead byte.
// The second byte has a restricted range after 0xE0 and 0xED.
const uint8_t __lo_bound_2 = __u == 0xE0 ? 0xA0 : __lo_bound;
const uint8_t __hi_bound_2 = __u == 0xED ? 0x9F : __hi_bound;
__c = __u & 0x0F;
__u = *_M_curr();
if (__u < __lo_bound_2 || __u > __hi_bound_2) [[unlikely]]
__c = _S_error();
// Second byte accepted, but the input ends before the third.
else if (__incr() == _M_last) [[unlikely]]
__c = _S_error();
else
{
__c = (__c << 6) | (__u & 0x3F);
__u = *_M_curr();
if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
__c = _S_error();
else
{
__c = (__c << 6) | (__u & 0x3F);
__incr();
}
}
}
else if (__u <= 0xF4) // 0xF0 to 0xF4
{
// Four-byte sequence: 3 payload bits in the lead byte.
// The second byte has a restricted range after 0xF0 and 0xF4.
const uint8_t __lo_bound_2 = __u == 0xF0 ? 0x90 : __lo_bound;
const uint8_t __hi_bound_2 = __u == 0xF4 ? 0x8F : __hi_bound;
__c = __u & 0x07;
__u = *_M_curr();
if (__u < __lo_bound_2 || __u > __hi_bound_2) [[unlikely]]
__c = _S_error();
// Second byte accepted, but the input ends before the third.
else if (__incr() == _M_last) [[unlikely]]
__c = _S_error();
else
{
__c = (__c << 6) | (__u & 0x3F);
__u = *_M_curr();
if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
__c = _S_error();
// Third byte accepted, but the input ends before the fourth.
else if (__incr() == _M_last) [[unlikely]]
__c = _S_error();
else
{
__c = (__c << 6) | (__u & 0x3F);
__u = *_M_curr();
if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
__c = _S_error();
else
{
__c = (__c << 6) | (__u & 0x3F);
__incr();
}
}
}
}
// 0xF5 to 0xFF is not accepted as a lead byte.
else [[unlikely]]
__c = _S_error();
_M_update(__c, __to_incr);
}
// Decode one code point from UTF-16 input starting at _M_curr() and pass
// it to _M_update() together with the number of code units consumed.
// A unit outside the surrogate range is a code point by itself. A high
// surrogate must be followed by a low surrogate; anything else produces
// the result of _S_error() and consumes a single code unit.
constexpr void
_M_read_utf16()
{
  _Guard<_Iter> __g{this, _M_curr()};
  char32_t __c{};
  uint8_t __to_incr = 1;
  const uint16_t __lead = *_M_curr()++;
  const bool __is_surrogate = 0xD800 <= __lead && __lead <= 0xDFFF;
  if (!__is_surrogate) [[likely]]
    __c = __lead;
  else if (__lead >= 0xDC00 || _M_curr() == _M_last)
    // A low surrogate with no preceding high surrogate, or a high
    // surrogate with nothing after it.
    __c = _S_error();
  else
    {
      const uint16_t __trail = *_M_curr();
      if (0xDC00 <= __trail && __trail <= 0xDFFF)
	{
	  // Well-formed pair: consume the trail unit and combine.
	  ++_M_curr();
	  __to_incr = 2;
	  const uint32_t __x = ((__lead & 0x3F) << 10) | (__trail & 0x3FF);
	  const uint32_t __w = (__lead >> 6) & 0x1F;
	  __c = ((__w + 1) << 16) | __x;
	}
      else [[unlikely]]
	__c = _S_error(); // High surrogate not followed by a low one.
    }
  _M_update(__c, __to_incr);
}
// Decode one code point from UTF-32 input: read a single code unit and
// pass it to _M_update(), substituting the result of _S_error() when the
// unit is not a Unicode scalar value. Always consumes one code unit.
constexpr void
_M_read_utf32()
{
  _Guard<_Iter> __g{this, _M_curr()};
  const char32_t __unit = *_M_curr()++;
  _M_update(__is_scalar_value(__unit) ? __unit : _S_error(), 1);
}
// Encode the code point __c as one or more code units in _M_buf.
// Also records __to_incr (the number of input code units that produced
// __c) in _M_to_increment, resets _M_buf_index to the first code unit and
// sets _M_buf_last to the number of code units written.
constexpr void
_M_update(char32_t __c, uint8_t __to_incr)
{
_M_to_increment = __to_incr;
_M_buf_index = 0;
// 32-bit output: the code point is its own single code unit.
if constexpr (sizeof(_ToFmt) == sizeof(uint32_t))
{
_M_buf[0] = __c;
_M_buf_last = 1;
}
// 16-bit output: one code unit, or a lead/trail surrogate pair.
else if constexpr (sizeof(_ToFmt) == sizeof(uint16_t))
{
if (__is_single_code_unit<_ToFmt>(__c))
{
_M_buf[0] = __c;
_M_buf[1] = 0;
_M_buf_last = 1;
}
else
{
// From http://www.unicode.org/faq/utf_bom.html#utf16-4
const char32_t __lead_offset = 0xD800 - (0x10000 >> 10);
char16_t __lead = __lead_offset + (__c >> 10);
char16_t __trail = 0xDC00 + (__c & 0x3FF);
_M_buf[0] = __lead;
_M_buf[1] = __trail;
_M_buf_last = 2;
}
}
// 8-bit output: one to four code units, chosen by the number of
// significant bits in __c. Unused buffer slots are zeroed.
else
{
static_assert(sizeof(_ToFmt) == 1);
int __bits = std::bit_width((uint32_t)__c);
if (__bits <= 7) [[likely]]
{
_M_buf[0] = __c;
_M_buf[1] = _M_buf[2] = _M_buf[3] = 0;
_M_buf_last = 1;
}
else if (__bits <= 11)
{
_M_buf[0] = 0xC0 | (__c >> 6);
_M_buf[1] = 0x80 | (__c & 0x3F);
_M_buf[2] = _M_buf[3] = 0;
_M_buf_last = 2;
}
else if (__bits <= 16)
{
_M_buf[0] = 0xE0 | (__c >> 12);
_M_buf[1] = 0x80 | ((__c >> 6) & 0x3F);
_M_buf[2] = 0x80 | (__c & 0x3F);
_M_buf[3] = 0;
_M_buf_last = 3;
}
else
{
_M_buf[0] = 0xF0 | ((__c >> 18) & 0x07);
_M_buf[1] = 0x80 | ((__c >> 12) & 0x3F);
_M_buf[2] = 0x80 | ((__c >> 6) & 0x3F);
_M_buf[3] = 0x80 | (__c & 0x3F);
_M_buf_last = 4;
}
}
}
// Obtain the code point to substitute for ill-formed input by invoking a
// value-initialized _ErrorHandler. The handler must return a Unicode
// scalar value; this is checked with __glibcxx_assert.
// Declared static (as the _S_ prefix indicates) because it does not use
// any state of the iterator.
static constexpr char32_t
_S_error()
{
  char32_t __c = _ErrorHandler()();
  __glibcxx_assert(__is_scalar_value(__c));
  return __c;
}
constexpr _Iter
_M_first() const requires bidirectional_iterator<_Iter>
{ return _M_first_and_curr._M_first; }
constexpr _Iter&
_M_curr() { return _M_first_and_curr._M_curr; }
constexpr _Iter
_M_curr() const { return _M_first_and_curr._M_curr; }
array _M_buf;
template
struct _First_and_curr
{
_First_and_curr() = default;
constexpr
_First_and_curr(_It __curr) : _M_curr(__curr) { }
template _It2>
constexpr
_First_and_curr(const _First_and_curr<_It2>& __other)
: _M_curr(__other._M_curr) { }
_It _M_curr;
};
template requires bidirectional_iterator<_It>
struct _First_and_curr<_It>
{
_First_and_curr() = default;
constexpr
_First_and_curr(_It __first, _It __curr)
: _M_first(__first), _M_curr(__curr) { }
template _It2>
constexpr
_First_and_curr(const _First_and_curr<_It2>& __other)
: _M_first(__other._M_first), _M_curr(__other._M_curr) { }
_It _M_first;
_It _M_curr;
};
_First_and_curr<_Iter> _M_first_and_curr;
uint8_t _M_buf_index = 0;
uint8_t _M_buf_last = 0;
uint8_t _M_to_increment = 0;
[[no_unique_address]] _Sent _M_last;
template _Sent2,
typename _ErrHandler>
requires convertible_to, _FromFmt2>
friend class _Utf_iterator;
};
// A view over _Range whose iterators are _Utf_iterator objects producing
// code units of type _ToFormat.
template
class _Utf_view
: public ranges::view_interface<_Utf_view<_ToFormat, _Range>>
{
using _Iterator = _Utf_iterator,
_ToFormat, ranges::iterator_t<_Range>,
ranges::sentinel_t<_Range>>;
// Create the begin iterator. Bidirectional iterators also record the
// start of the range, which here is the same as the current position.
template
constexpr auto
_M_begin(_Iter __first, _Sent __last)
{
if constexpr (bidirectional_iterator<_Iter>)
return _Iterator(__first, __first, __last);
else
return _Iterator(__first, __last);
}
// Create the end of the view. When the underlying sentinel has a
// different type from the iterator it is returned unchanged; otherwise
// an _Iterator positioned at __last is returned.
template
constexpr auto
_M_end(_Iter __first, _Sent __last)
{
if constexpr (!is_same_v<_Iter, _Sent>)
return __last;
else if constexpr (bidirectional_iterator<_Iter>)
return _Iterator(__first, __last, __last);
else
return _Iterator(__last, __last);
}
// The underlying range.
_Range _M_base;
public:
constexpr explicit
_Utf_view(_Range&& __r) : _M_base(std::forward<_Range>(__r)) { }
constexpr auto begin()
{ return _M_begin(ranges::begin(_M_base), ranges::end(_M_base)); }
constexpr auto end()
{ return _M_end(ranges::begin(_M_base), ranges::end(_M_base)); }
// Empty exactly when the underlying range is empty.
constexpr bool empty() const { return ranges::empty(_M_base); }
};
#ifdef __cpp_char8_t
template
using _Utf8_view = _Utf_view;
#else
template
using _Utf8_view = _Utf_view;
#endif
template
using _Utf16_view = _Utf_view;
template
using _Utf32_view = _Utf_view;
inline namespace __v15_1_0
{
#define _GLIBCXX_GET_UNICODE_DATA 150100
#include "unicode-data.h"
#ifdef _GLIBCXX_GET_UNICODE_DATA
# error "Invalid unicode data"
#endif
// The field width (1 or 2) of a code point.
// Code points below the first entry of __width_edges have width 1.
// Otherwise the width is selected by the parity of the number of table
// entries that are not greater than __c.
constexpr int
__field_width(char32_t __c) noexcept
{
  if (__c >= __width_edges[0]) [[unlikely]]
    {
      auto* __pos
	= std::upper_bound(__width_edges, std::end(__width_edges), __c);
      return (__pos - __width_edges) % 2 + 1;
    }
  return 1;
}
// Look up the Grapheme_Cluster_Break property of a code point.
// Each entry of __gcb_edges holds a code point shifted left by
// __gcb_shift_bits with a property value in the low bits; the entry
// preceding the lower bound of the search key supplies the property.
// @pre c <= 0x10FFFF
constexpr _Gcb_property
__grapheme_cluster_break_property(char32_t __c) noexcept
{
  constexpr uint32_t __mask = (1 << __gcb_shift_bits) - 1;
  // Setting all the property bits makes the key at least as large as any
  // table entry for the same code point.
  const uint32_t __key = (__c << __gcb_shift_bits) | __mask;
  auto* __pos = std::lower_bound(__gcb_edges, std::end(__gcb_edges), __key);
  return _Gcb_property(*(__pos - 1) & __mask);
}
// Whether __c is one of the code points listed in __incb_linkers.
constexpr bool
__is_incb_linker(char32_t __c) noexcept
{
  // Array is small enough that linear search is faster than binary search.
  for (const auto& __linker : __incb_linkers)
    if (__linker == __c)
      return true;
  return false;
}
// Look up the Indic_Conjunct_Break property of a code point.
// Each entry of __incb_edges holds a code point shifted left by two bits
// with a property value in the low two bits. Code points below the first
// entry have property value 0.
// @pre c <= 0x10FFFF
constexpr _InCB
__incb_property(char32_t __c) noexcept
{
  constexpr uint32_t __mask = 0x3;
  const auto __shifted = __c << 2;
  if (__shifted < __incb_edges[0]) [[likely]]
    return _InCB(0);
  auto* __pos = std::lower_bound(__incb_edges, std::end(__incb_edges),
				 __shifted | __mask);
  return _InCB(*(__pos - 1) & __mask);
}
// Whether __c has the Extended_Pictographic property.
// Code points below the first entry of __xpicto_edges do not. Otherwise
// the answer is given by the parity of the number of table entries that
// are not greater than __c.
// Like the other table lookups in this file, this cannot throw.
constexpr bool
__is_extended_pictographic(char32_t __c) noexcept
{
  if (__c < __xpicto_edges[0]) [[likely]]
    return false;
  auto* __p = std::upper_bound(__xpicto_edges, std::end(__xpicto_edges), __c);
  return (__p - __xpicto_edges) % 2 != 0;
}
// State kept by a grapheme cluster iterator while it scans for the next
// cluster boundary. It does not depend on the type of the underlying view.
struct _Grapheme_cluster_iterator_base
{
  // These two members are value-initialized so that an iterator created
  // for an empty range (which never assigns them) holds no indeterminate
  // values and can be copied, including during constant evaluation.
  char32_t _M_c{}; // First code point in the cluster.
  _Gcb_property _M_prop{}; // GCB property of _M_c.

  // Progress of matching an emoji ZWJ sequence within the current cluster.
  enum class _XPicto : unsigned char { _Init, _Zwj, _Matched, _Failed };

  _XPicto _M_xpicto_seq_state = _XPicto::_Init;
  // Number of consecutive Regional_Indicator code points seen since the
  // first code point of the cluster.
  unsigned char _M_RI_count = 0;
  // Whether a code point accepted by __is_incb_linker has been seen.
  bool _M_incb_linker_seen = false;

  // Start a new cluster whose first code point is __c with property __p,
  // clearing all per-cluster state.
  constexpr void
  _M_reset(char32_t __c, _Gcb_property __p)
  {
    _M_c = __c;
    _M_prop = __p;
    _M_xpicto_seq_state = _XPicto::_Init;
    _M_RI_count = 0;
    _M_incb_linker_seen = false;
  }

  // Feed the next code point __c (with property __p) to the emoji ZWJ
  // sequence state machine. Once the state is _Failed it stays _Failed
  // until _M_reset() is called.
  constexpr void
  _M_update_xpicto_seq_state(char32_t __c, _Gcb_property __p)
  {
    if (_M_xpicto_seq_state == _XPicto::_Failed)
      return;

    auto __next_state = _XPicto::_Failed;
    if (_M_xpicto_seq_state != _XPicto::_Zwj) // i.e. Init or Matched
      {
	if (__p == _Gcb_property::_Gcb_ZWJ)
	  {
	    if (_M_xpicto_seq_state == _XPicto::_Matched)
	      __next_state = _XPicto::_Zwj;
	    // We check _M_c here so that we do the lookup at most once,
	    // and only for clusters containing at least one ZWJ.
	    else if (__is_extended_pictographic(_M_c))
	      __next_state = _XPicto::_Zwj;
	  }
	else if (__p == _Gcb_property::_Gcb_Extend)
	  __next_state = _M_xpicto_seq_state; // no change
      }
    else // Zwj
      {
	// This assumes that all \p{Extended_Pictographic} emoji have
	// Grapheme_Cluster_Break=Other.
	if (__p == _Gcb_property::_Gcb_Other
	      && __is_extended_pictographic(__c))
	  __next_state = _XPicto::_Matched;
      }
    _M_xpicto_seq_state = __next_state;
  }

  // Count a Regional_Indicator code point, or restart the count when the
  // code point has any other property.
  constexpr void
  _M_update_ri_count(_Gcb_property __p)
  {
    if (__p == _Gcb_property::_Gcb_Regional_Indicator)
      ++_M_RI_count;
    else
      _M_RI_count = 0;
  }

  // Record that the cluster contains a code point accepted by
  // __is_incb_linker. The property argument is unused.
  constexpr void
  _M_update_incb_state(char32_t __c, _Gcb_property)
  {
    if (__is_incb_linker(__c))
      _M_incb_linker_seen = true;
  }
};
// Split a range into extended grapheme clusters.
template requires ranges::view<_View>
class _Grapheme_cluster_view
: public ranges::view_interface<_Grapheme_cluster_view<_View>>
{
public:
constexpr
_Grapheme_cluster_view(_View __v)
: _M_begin(_Utf32_view<_View>(std::move(__v)).begin())
{ }
constexpr auto begin() const { return _M_begin; }
constexpr auto end() const { return _M_begin.end(); }
private:
struct _Iterator : private _Grapheme_cluster_iterator_base
{
private:
// Iterator over the underlying code points.
using _U32_iterator = ranges::iterator_t<_Utf32_view<_View>>;
public:
// TODO: Change value_type to be subrange<_U32_iterator> instead?
// Alternatively, value_type could be _Utf32_view>.
// That would be the whole cluster, not just the first code point.
// Would need to store two iterators and find end of current cluster
// on increment, so operator* returns value_type(_M_base, _M_next).
using value_type = char32_t;
using iterator_concept = forward_iterator_tag;
using difference_type = ptrdiff_t;
// Construct an iterator whose current cluster starts at __i.
// Unless __i is already at the end of its range, the first code point
// and its Grapheme_Cluster_Break property are cached in _M_c and _M_prop.
constexpr
_Iterator(_U32_iterator __i)
: _M_base(__i)
{
if (__i != __i.end())
{
_M_c = *__i;
_M_prop = __grapheme_cluster_break_property(_M_c);
}
}
// The first code point of the current extended grapheme cluster.
constexpr value_type
operator*() const
{ return _M_c; }
constexpr auto
operator->() const
{ return &_M_c; }
// Move to the next extended grapheme cluster.
// Scans forward from the start of the current cluster, updating the
// per-cluster state for every code point, until _M_is_break() reports a
// boundary or the end of the input is reached. Does nothing when already
// at the end.
constexpr _Iterator&
operator++()
{
  const auto __end = _M_base.end();
  if (_M_base == __end)
    return *this;

  auto __prev_prop = _M_prop;
  auto __next = _M_base;
  for (++__next; __next != __end; ++__next)
    {
      const char32_t __cp = *__next;
      const auto __prop = __grapheme_cluster_break_property(__cp);
      _M_update_xpicto_seq_state(__cp, __prop);
      _M_update_ri_count(__prop);
      _M_update_incb_state(__cp, __prop);
      if (_M_is_break(__prev_prop, __prop, __next))
	{
	  // Found a grapheme cluster break: __cp starts the next cluster.
	  _M_reset(__cp, __prop);
	  break;
	}
      __prev_prop = __prop;
    }
  _M_base = __next;
  return *this;
}
constexpr _Iterator
operator++(int)
{
auto __tmp = *this;
++*this;
return __tmp;
}
constexpr bool
operator==(const _Iterator& __i) const
{ return _M_base == __i._M_base; }
// This supports iter != iter.end()
constexpr bool
operator==(const ranges::sentinel_t<_View>& __i) const
{ return _M_base == __i; }
// Iterator to the start of the current cluster.
constexpr auto base() const { return _M_base.base(); }
// The end of the underlying view (not the end of the current cluster!)
constexpr auto end() const { return _M_base.end(); }
// Field width of the first code point in the cluster.
constexpr int
width() const noexcept
{ return __field_width(_M_c); }
private:
_U32_iterator _M_base;
// Implement the Grapheme Cluster Boundary Rules from Unicode Annex #29
// http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
// This implements the rules from TR29 revision 43 in Unicode 15.1.0.
// Return true if there is a break between code point with property p1
// and code point with property p2.
// __curr refers to the code point with property p2; it is only used by
// rule GB9c. The rules are tested in order and the first rule that
// applies decides the result.
constexpr bool
_M_is_break(_Gcb_property __p1, _Gcb_property __p2,
_U32_iterator __curr) const
{
using enum _Gcb_property;
// Rules GB3 and GB4
if (__p1 == _Gcb_Control || __p1 == _Gcb_LF)
return true; // Break after Control or LF.
if (__p1 == _Gcb_CR)
return __p2 != _Gcb_LF; // Do not break between a CR and LF.
// Rule GB5
if (__p2 == _Gcb_Control || __p2 == _Gcb_CR || __p2 == _Gcb_LF)
return true; // Break before Control, CR or LF.
// Rule GB6
if (__p1 == _Gcb_L)
switch (__p2)
{
case _Gcb_L:
case _Gcb_V:
case _Gcb_LV:
case _Gcb_LVT:
return false; // Do not break Hangul syllable sequences.
default:
return true;
}
// Rule GB7
if (__p1 == _Gcb_LV || __p1 == _Gcb_V)
switch (__p2)
{
case _Gcb_V:
case _Gcb_T:
return false; // Do not break Hangul syllable sequences.
default:
return true;
}
// Rule GB8
if (__p1 == _Gcb_LVT || __p1 == _Gcb_T)
return __p2 != _Gcb_T; // Do not break Hangul syllable sequences.
// Rule GB9
if (__p2 == _Gcb_Extend || __p2 == _Gcb_ZWJ)
return false; // Do not break before extending characters or ZWJ.
// The following GB9x rules only apply to extended grapheme clusters,
// which is what the C++ standard uses (not legacy grapheme clusters).
// Rule GB9a
if (__p2 == _Gcb_SpacingMark)
return false; // Do not break before SpacingMarks,
// Rule GB9b
if (__p1 == _Gcb_Prepend)
return false; // or after Prepend characters.
// Rule GB9c (Unicode 15.1.0)
// Do not break within certain combinations with
// Indic_Conjunct_Break (InCB)=Linker.
// The cheap checks come first: a linker must have been seen, and both
// the first code point of the cluster and the current one must be
// consonants, before the cluster is re-scanned.
if (_M_incb_linker_seen
&& __incb_property(_M_c) == _InCB::_Consonant
&& __incb_property(*__curr) == _InCB::_Consonant)
{
// Match [_M_base, __curr] against regular expression
// Consonant ([Extend Linker]* Linker [Extend Linker]* Consonant)+
// __have_linker tracks whether a linker has been seen since the most
// recent consonant.
bool __have_linker = false;
auto __it = _M_base;
while (++__it != __curr)
{
if (__is_incb_linker(*__it))
__have_linker = true;
else
{
auto __incb = __incb_property(*__it);
if (__incb == _InCB::_Consonant)
__have_linker = false;
else if (__incb != _InCB::_Extend)
break;
}
}
// The whole range matched and a linker precedes the final consonant.
if (__it == __curr && __have_linker)
return false;
}
// Rule GB11
// Do not break within emoji modifier sequences
// or emoji zwj sequences.
if (__p1 == _Gcb_ZWJ && _M_xpicto_seq_state == _XPicto::_Matched)
return false;
// Rules GB12 and GB13
// Do not break within emoji flag sequences. That is, do not break
// between regional indicator (RI) symbols if there is an odd number
// of RI characters before the break point.
if (__p1 == _Gcb_property::_Gcb_Regional_Indicator && __p1 == __p2)
return (_M_RI_count & 1) == 0;
// Rule GB999
return true; // Otherwise, break everywhere.
}
};
_Iterator _M_begin;
};
} // namespace __v15_1_0
// Return the field width of a string.
template
constexpr size_t
__field_width(basic_string_view<_CharT> __s)
{
if (__s.empty()) [[unlikely]]
return 0;
_Grapheme_cluster_view> __gc(__s);
auto __it = __gc.begin();
const auto __end = __gc.end();
size_t __n = __it.width();
while (++__it != __end)
__n += __it.width();
return __n;
}
// Truncate a string to at most `__max` field width units, and return the
// resulting field width.
template
constexpr size_t
__truncate(basic_string_view<_CharT>& __s, size_t __max)
{
if (__s.empty()) [[unlikely]]
return 0;
_Grapheme_cluster_view> __gc(__s);
auto __it = __gc.begin();
const auto __end = __gc.end();
size_t __n = __it.width();
if (__n > __max)
{
__s = {};
return 0;
}
while (++__it != __end)
{
size_t __n2 = __n + __it.width();
if (__n2 > __max)
{
__s = basic_string_view<_CharT>(__s.begin(), __it.base());
return __n;
}
__n = __n2;
}
return __n;
}
// Whether the literal encoding for _CharT is a Unicode encoding form.
// Always true for char16_t, char32_t and (where available) char8_t.
// For other types the answer is derived from the encoding name provided
// by the compiler through predefined macros; when no such macro is
// available the result is false.
template
consteval bool
__literal_encoding_is_unicode()
{
if constexpr (is_same_v<_CharT, char16_t>)
return true;
else if constexpr (is_same_v<_CharT, char32_t>)
return true;
#ifdef __cpp_char8_t
else if constexpr (is_same_v<_CharT, char8_t>)
return true;
#endif
// Name of the literal encoding; stays empty when it is unknown.
const char* __enc = "";
#ifdef __GNUC_EXECUTION_CHARSET_NAME
auto __remove_iso10646_prefix = [](const char* __s) {
// GNU iconv allows "ISO-10646/" prefix (case-insensitive).
if (__s[0] == 'I' || __s[0] == 'i')
if (__s[1] == 'S' || __s[1] == 's')
if (__s[2] == 'O' || __s[2] == 'o')
if (string_view(__s + 3).starts_with("-10646/"))
return __s + 10;
return __s;
};
if constexpr (is_same_v<_CharT, char>)
__enc = __remove_iso10646_prefix(__GNUC_EXECUTION_CHARSET_NAME);
# if defined _GLIBCXX_USE_WCHAR_T && defined __GNUC_WIDE_EXECUTION_CHARSET_NAME
else
__enc = __remove_iso10646_prefix(__GNUC_WIDE_EXECUTION_CHARSET_NAME);
# endif
// Accept "UTF" in any letter case, optionally followed by '-'.
if ((__enc[0] == 'U' || __enc[0] == 'u')
&& (__enc[1] == 'T' || __enc[1] == 't')
&& (__enc[2] == 'F' || __enc[2] == 'f'))
{
__enc += 3;
if (__enc[0] == '-')
++__enc;
// "8", optionally followed by "//", is accepted for every _CharT.
if (__enc[0] == '8')
return __enc[1] == '\0' || string_view(__enc + 1) == "//";
// "16" and "32" (with an optional "//" suffix) are only accepted when
// _CharT is not char.
else if constexpr (!is_same_v<_CharT, char>)
{
string_view __s(__enc);
if (__s.ends_with("//"))
__s.remove_suffix(2);
return __s == "16" || __s == "32";
}
}
#elif defined __clang_literal_encoding__
if constexpr (is_same_v<_CharT, char>)
__enc = __clang_literal_encoding__;
# if defined _GLIBCXX_USE_WCHAR_T && defined __clang_wide_literal_encoding__
else
__enc = __clang_wide_literal_encoding__;
# endif
// Clang accepts "-fexec-charset=utf-8" but the macro is still uppercase.
string_view __s(__enc);
if (__s == "UTF-8")
return true;
else if constexpr (!is_same_v<_CharT, char>)
return __s == "UTF-16" || __s == "UTF-32";
#endif
return false;
}
// Whether the literal encoding is UTF-8, as reported by
// __literal_encoding_is_unicode.
// NOTE(review): presumably this queries the encoding of ordinary (char)
// literals, for which UTF-8 is the only name accepted - confirm the
// template argument of the call.
consteval bool
__literal_encoding_is_utf8()
{ return __literal_encoding_is_unicode(); }
// Whether the ordinary literal encoding gives the characters '0', 'A',
// 'Z', 'a' and 'z' the same values as ASCII does.
consteval bool
__literal_encoding_is_extended_ascii()
{
  if ('0' != 0x30)
    return false;
  if ('A' != 0x41 || 'Z' != 0x5a)
    return false;
  return 'a' == 0x61 && 'z' == 0x7a;
}
// Compare two character set names using the loose matching rules of
// https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching
// Only alphanumeric characters are significant, letters are compared
// through their base 64 digit value, and a '0' that does not follow a
// digit is ignored.
constexpr bool
__charset_alias_match(string_view __a, string_view __b)
{
  // Map alphanumeric chars to their base 64 value, everything else to 127.
  // __after_digit remembers whether the previously mapped character was a
  // digit, which decides whether a '0' is significant.
  auto __value_of = [](char __ch, bool& __after_digit) -> unsigned char {
    if (__ch != '0') [[likely]]
      {
	const auto __val = __detail::__from_chars_alnum_to_val(__ch);
	__after_digit = __val < 10;
	return __val;
      }
    return __after_digit ? 0 : 127;
  };

  // Move __pos forward to the next significant character, if any, and
  // return that character's value.
  auto __next = [&__value_of](auto& __pos, auto __stop,
			      bool& __after_digit) -> unsigned char {
    unsigned char __val{};
    while (__pos != __stop
	     && (__val = __value_of(*__pos, __after_digit)) == 127)
      ++__pos;
    return __val;
  };

  auto __pos_a = __a.begin(), __pos_b = __b.begin();
  const auto __stop_a = __a.end(), __stop_b = __b.end();
  bool __digit_a = false, __digit_b = false;
  for (;; ++__pos_a, ++__pos_b)
    {
      const unsigned char __val_a = __next(__pos_a, __stop_a, __digit_a);
      const unsigned char __val_b = __next(__pos_b, __stop_b, __digit_b);
      // Stop when we reach the end of a string, or get a mismatch.
      if (__pos_a == __stop_a)
	return __pos_b == __stop_b;
      if (__pos_b == __stop_b || __val_a != __val_b)
	return false;
    }
}
} // namespace __unicode
namespace ranges
{
// Opt in to being a borrowed range exactly when the underlying _Range is
// itself a borrowed range.
template
inline constexpr bool
enable_borrowed_range>
= enable_borrowed_range<_Range>;
// Likewise: a borrowed range exactly when the underlying _Range is one.
template
inline constexpr bool
enable_borrowed_range>
= enable_borrowed_range<_Range>;
} // namespace ranges
_GLIBCXX_END_NAMESPACE_VERSION
} // namespace std
#endif // C++20
#endif // _GLIBCXX_UNICODE_H