"""Utilities for working with strings.
.. versionadded:: 0.2.0
"""
import configparser
import math
import re
import string
import unicodedata
from collections import namedtuple
from datetime import timedelta
from functools import lru_cache
from wcwidth import wcswidth
from .utils import check_type
__all__ = ['BINARY_PREFIXES', 'BOOLEAN_STATES', 'DECIMAL_PREFIXES', 'NO_PREFIX',
'Prefix', 'TranslationTable', 'bin_prefix', 'dec_prefix',
'find_bin_prefix', 'find_dec_prefix', 'format_bin_prefix',
'format_dec_prefix', 'format_timedelta', 'insert_separator',
'int2str', 'is_hexdigit', 'parse_timedelta', 'purge', 'shorten',
'slugify', 'split_host_port', 'str2bool', 'str2port', 'str2tuple',
'walign', 'wlen', 'wshorten']
BOOLEAN_STATES = configparser.ConfigParser.BOOLEAN_STATES.copy()
"""Dictionary with mappings from strings to boolean values.
Used by the function :func:`str2bool`.
This dictionary can be modified. The default values are:
* ``True``: ``'1', 'yes', 'true', 'on'``
* ``False``: ``'0', 'no', 'false', 'off'``
These are the same as in :attr:`configparser.ConfigParser.BOOLEAN_STATES`.
"""
[docs]def str2bool(s):
"""Convert a string to a boolean value.
The string is converted to lowercase before looked up in
the :data:`BOOLEAN_STATES` dictionary.
:param str s: the string
:return: a boolean value
:rtype: bool
:raises ValueError: if the string is not in :data:`BOOLEAN_STATES`
"""
try:
return BOOLEAN_STATES[s.lower()]
except KeyError:
raise ValueError('not a boolean: %s' % s)
[docs]def str2tuple(s, sep=',', converter=None, *, maxsplit=-1):
"""Convert a string to a tuple.
If ``converter`` is given and not ``None``, it must be a callable that
takes a string parameter and returns an object of the required type,
or else a tuple with string elements will be returned.
>>> str2tuple('1, 2, 3,4', converter=int)
(1, 2, 3, 4)
>>> str2tuple('on, off, no, true, YES')
('on', 'off', 'no', 'true', 'YES')
>>> str2tuple('on, off, no, true, YES', converter=str2bool)
(True, False, False, True, True)
>>> str2tuple('a, b, , d')
('a', 'b', '', 'd')
:param str s: the string
:param str sep: the separator (whitespace around ``sep`` will be ignored)
:param converter: the converter function
:type converter: callable(str)
:param int maxsplit: max. number of splits (-1 means no limit)
:return: tuple with elements of the required type
:rtype: tuple
.. versionchanged:: 0.14.0 Add parameter ``maxsplit``
"""
if s:
f = converter or str
return tuple(f(x.strip()) for x in s.split(sep, maxsplit))
return ()
[docs]def str2port(s):
"""Convert a string to a network port number.
:param str s: the string to convert
:return: port number
:rtype: int
:raises ValueError: if ``s`` cannot be converted to a number in [0..65535]
.. versionadded:: 0.5.0
"""
port = int(s)
if 0 <= port <= 65535:
return port
raise ValueError('not a valid port number: %d' % port)
[docs]def split_host_port(s, port=None):
"""Split a string into host and port.
>>> split_host_port('example.com:42', 21)
('example.com', 42)
>>> split_host_port('example.com', 21)
('example.com', 21)
:param str s: the string to split
:param int port: port that is used if there is none in the string
:return: host and port number
:rtype: str, int
:raises ValueError: if ``port`` or the port number in ``s``
are not in [0..65535] or neither of them is given
.. versionadded:: 0.5.0
"""
if port is not None and not (0 <= port <= 65535):
raise ValueError('not a valid port number: %d' % port)
a = s.split(':', 1)
if len(a) == 1:
if port is None:
raise ValueError('no port in parameter and default port is None')
return a[0], port
else:
return a[0], str2port(a[1])
[docs]def insert_separator(s, sep, group_size, reverse=False):
"""Insert separators in a string.
>>> insert_separator('008041aefd7e', ':', 2)
00:80:41:ae:fd:7e
>>> insert_separator('aaabbbcccd', ':', 3)
'aaa:bbb:ccc:d'
>>> insert_separator('aaabbbcccd', ':', 3, True)
'a:aab:bbc:ccd'
>>> insert_separator('9783161484100', '-', (3, 1, 2, 6))
'978-3-16-148410-0'
:param str s: the string
:param str sep: the separator character(s)
:param group_size: the size of each group
:type group_size: int or sequence of ints
:param bool reverse: if ``True`` group from right to left instead from
left to right
:return: string with separators
:rtype: str
:raises ValueError: if ``group_size < 1``
.. versionadded:: 0.5.0
.. versionchanged:: 0.6.0
Add parameter ``reverse``
.. versionchanged:: 0.9.0
Groups can be of different sizes
"""
if reverse:
s = s[::-1]
if isinstance(group_size, int):
if group_size < 1:
raise ValueError('group_size must be >= 1')
r = [s[i:i + group_size] for i in range(0, len(s), group_size)]
else:
if not all(map(lambda x: x > 0, group_size)):
raise ValueError('group_size must be >= 1')
r = []
idx = 0
for size in group_size:
x = s[idx:idx + size]
if x:
r.append(x)
else:
break
idx += size
else:
if idx < len(s):
r.append(s[idx:])
if reverse:
return sep.join(r)[::-1]
else:
return sep.join(r)
[docs]def purge(s, chars=None, negate=False):
"""Purge characters from a string.
Each character in ``chars`` will be eliminated from the string.
If ``chars=None`` or ``chars=''`` all consecutive whitespace
are replaced by a single space.
if ``negate=True`` all characters **not** in chars will be
purged (only applies when ``chars`` is at least one character)
>>> purge('00:80:41:ae:fd:7e', ':')
008041aefd7e
:param str s: the string
:param str chars: the characters
:param bool negate: see above
:return: the purged string
:rtype: str
.. versionadded:: 0.5.0
"""
if chars:
if negate:
return ''.join(c for c in s if c in chars)
return ''.join(c for c in s if c not in chars)
return re.sub(r'\s+', ' ', s)
[docs]def shorten(text, width=80, placeholder='…', pos='right'):
"""Shorten the text to fit in the given width.
If ``len(text) <= width`` the ``text`` is returned unchanged.
>>> text = 'Lorem ipsum dolor sit amet'
>>> shorten(text, width=15)
'Lorem ipsum do…'
>>> shorten(text, width=15, placeholder=' ... ', pos='middle')
'Lorem ... amet'
>>> shorten(text, width=15, pos='left')
'…dolor sit amet'
:param str text: the text
:param int width: the width
:param str placeholder: the placeholder
:param str pos: position (``'left', 'middle', 'right'``) of
placeholder in text
:return: the shortened text
:rtype: str
:raises ValueError: if ``width < len(placeholder)`` or ``pos`` is unknown
"""
if width < len(placeholder):
raise ValueError('width must be >= len(placeholder)')
if pos not in ('left', 'right', 'middle'):
raise ValueError('unknown pos: %s' % pos)
if len(text) <= width:
return text
if pos == 'right':
return text[:width - len(placeholder)] + placeholder
elif pos == 'left':
return placeholder + text[len(text) - width + len(placeholder):]
else:
p1 = math.ceil((width - len(placeholder)) / 2)
p2 = width - p1 - len(placeholder)
return text[:p1] + placeholder + (text[-p2:] if p2 else '')
Prefix = namedtuple('Prefix', 'name, symbol, factor')
Prefix.__doc__ = """\
Class for prefixes with fields ``name, symbol, factor``.
Used by the ``*_prefix()`` functions.
"""
NO_PREFIX = Prefix('', '', 1) #: No prefix
BINARY_PREFIXES = [
Prefix('yobi', 'Yi', 2**80),
Prefix('zebi', 'Zi', 2**70),
Prefix('exbi', 'Ei', 2**60),
Prefix('pebi', 'Pi', 2**50),
Prefix('tebi', 'Ti', 2**40),
Prefix('gibi', 'Gi', 2**30),
Prefix('mebi', 'Mi', 2**20),
Prefix('kibi', 'Ki', 2**10),
NO_PREFIX
]
r"""List with binary prefixes.
The entries in this list are of type :class:`Prefix`.
===== ====== ======
Name Symbol Factor
===== ====== ======
yobi Yi 2\ :sup:`80`
zebi Zi 2\ :sup:`70`
exbi Ei 2\ :sup:`60`
pebi Pi 2\ :sup:`50`
tebi Ti 2\ :sup:`40`
gibi Gi 2\ :sup:`30`
mebi Mi 2\ :sup:`20`
kibi Ki 2\ :sup:`10`
\ 1
===== ====== ======
"""
DECIMAL_PREFIXES = [
Prefix('yotta', 'Y', 10**24),
Prefix('zetta', 'Z', 10**21),
Prefix('exa', 'E', 10**18),
Prefix('peta', 'P', 10**15),
Prefix('tera', 'T', 10**12),
Prefix('giga', 'G', 10**9),
Prefix('mega', 'M', 10**6),
Prefix('kilo', 'k', 10**3),
Prefix('hecto', 'h', 10**2),
Prefix('deca', 'da', 10**1),
NO_PREFIX,
Prefix('deci', 'd', 10**-1),
Prefix('centi', 'c', 10**-2),
Prefix('milli', 'm', 10**-3),
Prefix('micro', 'µ', 10**-6),
Prefix('nano', 'n', 10**-9),
Prefix('pico', 'p', 10**-12),
Prefix('femto', 'f', 10**-15),
Prefix('atto', 'a', 10**-18),
Prefix('zepto', 'z', 10**-21),
Prefix('yocto', 'y', 10**-24),
]
r"""List with decimal prefixes.
The entries in this list are of type :class:`Prefix`.
===== ====== ======
Name Symbol Factor
===== ====== ======
yotta Y 10\ :sup:`24`
zetta Z 10\ :sup:`21`
exa E 10\ :sup:`18`
peta P 10\ :sup:`15`
tera T 10\ :sup:`12`
giga G 10\ :sup:`9`
mega M 10\ :sup:`6`
kilo k 10\ :sup:`3`
hecto h 10\ :sup:`2`
deca da 10\ :sup:`1`
\ 1
deci d 10\ :sup:`-1`
centi c 10\ :sup:`-2`
milli m 10\ :sup:`-3`
micro µ 10\ :sup:`-6`
nano n 10\ :sup:`-9`
pico p 10\ :sup:`-12`
femto f 10\ :sup:`-15`
atto a 10\ :sup:`-18`
zepto z 10\ :sup:`-21`
yocto y 10\ :sup:`-24`
===== ====== ======
"""
[docs]def bin_prefix(value):
"""Get an appropriate binary prefix for an integer number.
:param int value: the number
:return: binary prefix
:rtype: Prefix
:raises TypeError: if value is not an integer
"""
check_type(value, int, 'value')
if value == 0:
return NO_PREFIX
value = abs(value)
for p in BINARY_PREFIXES:
if value / p.factor >= 1.0:
return p
[docs]def find_bin_prefix(s):
"""Find binary prefix for name or symbol.
:param str s: name (case-insensitive) or symbol (case-sensitive)
:return: binary prefix or ``None`` if not found
:rtype: Prefix or None
"""
for p in BINARY_PREFIXES:
if s.lower() == p.name or s == p.symbol:
return p
[docs]def dec_prefix(value, restricted=True):
"""Get an appropriate decimal prefix for a number.
:param value: the number
:type value: int or float
:param bool restricted: if ``True`` only integer powers of 1000 are used,
i.e. *hecto, deca, deci, centi* are skipped
:return: decimal prefix
:rtype: Prefix
:raises TypeError: if value is not of type int or float
"""
check_type(value, (int, float), 'value')
if value == 0:
return NO_PREFIX
value = abs(value)
for p in DECIMAL_PREFIXES:
if restricted and p.name in ('hecto', 'deca', 'deci', 'centi'):
continue
if value / p.factor >= 1.0:
return p
return p
[docs]def find_dec_prefix(s):
"""Find decimal prefix for name or symbol.
:param str s: name (case-insensitive) or symbol (case-sensitive);
instead of the symbol ``µ`` the letter ``u`` can be used
:return: decimal prefix or ``None`` if not found
:rtype: Prefix or None
"""
if s == 'u':
s = 'µ'
for p in DECIMAL_PREFIXES:
if s.lower() == p.name or s == p.symbol:
return p
_TIMEDELTA_FRMT_RE = re.compile(
r'%(?P<flag>[0 ]?)(?P<width>\d*)(?P<code>[DHMSs])')
_TdFmtTuple = namedtuple('TdFmtTuple', 'text, flag, width, code')
@lru_cache(maxsize=32)
def _timedelta_format(fmt_str):
replacements = [('%X', '%02H:%02M:%02S'),
('%Y', '%02H:%02M'),
('%Z', '%02M:%02S')]
for s, r in replacements:
fmt_str = fmt_str.replace(s, r)
lst = []
pos = 0
units = ('s', 'S', 'M', 'H', 'D')
max_unit = 's'
for m in _TIMEDELTA_FRMT_RE.finditer(fmt_str):
if m['width'] and int(m['width']) < 1:
raise ValueError('width must be > 0')
lst.append(_TdFmtTuple(fmt_str[pos:m.start()], m['flag'],
m['width'], m['code']))
pos = m.end()
max_unit = units[max(units.index(max_unit), units.index(m['code']))]
lst.append(fmt_str[pos:])
return lst, max_unit
[docs]def parse_timedelta(string, fmt_str):
"""Parse a string as a time delta according to a format.
The time delta is the difference between two points in time,
e.g. a duration.
.. note::
The :ref:`format specifier <ref-timedelta-format-specifiers>`
``'%s'`` for microseconds can only be used for strings that are the
fractional part of a second. The string ``'1'`` is ``100000 µs``;
but ``'000001'`` is ``1 µs``.
>>> parse_timedelta('03:21.001', '%M:%02S.%s')
datetime.timedelta(seconds=201, microseconds=1000)
>>> str(_)
'0:03:21.001000'
:param str s: the string
:param str fmt_str: string with
:ref:`format specifiers <ref-timedelta-format-specifiers>`
:return: timedelta object
:rtype: datetime.timedelta
:raises ValueError: if the string cannot be parsed
"""
fmt, _ = _timedelta_format(fmt_str)
lst = []
for x in fmt:
if isinstance(x, tuple):
s = '%(text)s'
if x.width and int(x.width) > 1:
s += r'(?P<%(code)s>[ \d]{' + x.width + '})'
else:
s += r'(?P<%(code)s>\d+)'
lst.append(s % x._asdict())
else:
lst.append(x)
m = re.fullmatch(''.join(lst), string)
if m:
args = {'s': 0, 'S': 0, 'M': 0, 'H': 0, 'D': 0}
for k, v in m.groupdict().items():
if k == 's':
s = v.lstrip()
args[k] = round(int(s) * 10**(6 - len(s)))
else:
args[k] = int(v)
return timedelta(days=args['D'], hours=args['H'], minutes=args['M'],
seconds=args['S'], microseconds=args['s'])
else:
raise ValueError('time data %r does not match format %r' %
(string, fmt_str))
[docs]def is_hexdigit(s):
"""Check if all characters are hexadecimal digits.
:param str s: the string
:return: ``True`` if all characters in the string are hexadecimal digits
and there is at least one character
:rtype: bool
.. versionadded:: 0.5.0
"""
return bool(s) and all(x in string.hexdigits for x in s)
_DIGITS = tuple(map(str, range(10))) + tuple(string.ascii_lowercase)
[docs]def int2str(n, base):
"""Convert an integer to a string.
For ``base > 10`` lower case letters are used for digits.
See also the built-in functions :func:`bin`, :func:`oct`, :func:`hex`.
:param int n: the integer
:param int base: the base (``2 <= base <= 36``)
:return: converted integer
:rtype: str
:raises TypeError: if ``n`` or ``base`` are not integers
:raises ValueError: if ``base`` is outside the allowed range
.. versionadded:: 0.7.0
"""
check_type(n, int, 'n')
check_type(base, int, 'base')
if base < 2 or base > 36:
raise ValueError('base must be >= 2 and <= 36')
if base == 10:
return str(n)
s = ''
if not n:
return '0'
sign = '-' if n < 0 else ''
n = abs(n)
while n > 0:
n, r = divmod(n, base)
s = _DIGITS[r] + s
return sign + s
_SLUGIFY_CHAR_MAP = str.maketrans({' ': '-', '\t': '-', '\n': '-', "'": '-',
'ß': 'ss', 'æ': 'ae', 'œ': 'oe', 'ø': 'o',
'þ': 'th', 'ð': 'd'})
[docs]def slugify(s):
"""Slugify a string.
In the created `slug
<https://en.wikipedia.org/wiki/Clean_URL#Slug>`_ white space
characters are replaced by dashes (``-``) and all consecutive
dashes but one are eliminated.
:param str s: the string
:return: slugified string
:rtype: str
.. versionadded:: 0.9.0
"""
s = s.lower().translate(_SLUGIFY_CHAR_MAP)
s = unicodedata.normalize('NFD', s)
s = re.sub(b'[^a-zA-Z0-9-]', b'', s.encode()).decode()
s = re.sub('-+', '-', s).strip('-')
return s
[docs]class TranslationTable:
"""TranslationTable class.
This class is for use with :meth:`str.translate`. If a character is not
in ``mapped_chars``, ``unmapped_chars``, or ``delete_chars``
a :exc:`ValueError` will be raised. See also: :meth:`str.maketrans`.
:param mapped_chars: mapping from character to replacement string
:type mapped_chars: dict(str, str)
:param str unmapped_chars: string with characters that will not be replaced
:param str delete_chars: string with characters that will be deleted
:return: the resulting string
:rtype: str
:raises ValueError: if a character is not allowed
.. versionadded:: 0.9.0
"""
def __init__(self, mapped_chars, unmapped_chars='', delete_chars=''):
self._mapped_chars = {ord(k): v for k, v in mapped_chars.items()}
self._unmapped_chars = [ord(ch) for ch in unmapped_chars]
self._delete_chars = [ord(ch) for ch in delete_chars]
def __getitem__(self, n):
if n in self._mapped_chars:
return self._mapped_chars[n]
if n in self._unmapped_chars:
raise LookupError
if n in self._delete_chars:
return None
raise ValueError('character "%s" (%d) is not allowed' % (chr(n), n))
[docs]def wlen(text):
r"""Determine the number of columns a string actually occupies in a terminal.
Though most characters need just one column there are characters that use
zero (e.g. NULL) or two (e.g. Japanese) columns. The :func:`len` function
counts the number of characters in a string.
Examples::
>>> s1 = 'a\0b'
>>> s2 = 'は日本'
>>> len(s1)
3
>>> len(s2)
3
>>> print(s1)
ab
>>> print(s2)
は日本
>>> wlen(s1)
2
>>> wlen(s2)
6
:param str text: the string
:return: number of columns
:rtype: int
:raises ValueError: if length could not be determined
.. versionadded:: 0.15.0
"""
w = wcswidth(text)
if w == -1:
raise ValueError('length could not be determined')
return w
[docs]def walign(text, width, fill=' ', align='center'):
"""Align a string even if not all characters occupy only one column\
in a terminal.
The string methods :meth:`~str.center`, :meth:`~str.ljust`,
and :meth:`~str.rjust` only consider the number of characters when
aligning a string. If a string contains characters that occupy
zero or two columns (see :func:`wlen`) the result is wrong when
displayed in a terminal::
s = 'abcde'
w = 'は日本競馬'
wlen(s) -> 5
wlen(w) -> 10
s.center(12, '.') -> 12 columns (correct)
w.center(12, '.') -> 17 columns (wrong)
walign(s, 12, '.', 'center') -> 12 columns (correct)
walign(w, 12, '.', 'center') -> 12 columns (correct)
:param str text: the string
:param int width: number of columns wherein to align the string
:param str fill: fill character
:param str align: alignment (one of ``'center', 'left', 'right``)
:return: aligned string (if ``wlen(text) >= width`` the original
string is returned)
:raises ValueError: if length could not be determined,
``align`` is unknown or ``wlen(fill) != 1``
.. versionadded:: 0.15.0
"""
if wcswidth(fill) != 1:
raise ValueError('wlen(fill) must be 1')
if align not in ('center', 'left', 'right'):
raise ValueError(f'unknown alignment {align!r}')
w = wlen(text)
if w >= width:
return text
d = width - w
if align == 'center':
return d // 2 * fill + text + (width - w - d // 2) * fill
if align == 'left':
return text + (width - w) * fill
if align == 'right':
return (width - w) * fill + text
[docs]def wshorten(text, width=80, placeholder='…', pos='right'):
"""Shorten the text to fit in the given width even if not all characters\
occupy only one column in a terminal.
:param str text: the text
:param int width: the width
:param str placeholder: the placeholder
:param str pos: position (``'left', 'middle', 'right'``) of
placeholder in text
:return: the shortened text (may be one character less then ``width``)
:rtype: str
:raises ValueError: if length could not be determined,
``width < wlen(placeholder)`` or ``pos`` is unknown
.. versionadded:: 0.15.0
"""
w = wlen(placeholder)
if width < w:
raise ValueError('width must be >= wlen(placeholder)')
if pos not in ('left', 'right', 'middle'):
raise ValueError(f'unknown pos: {pos}')
if wlen(text) <= width:
return text
if pos == 'right':
result = _slice(text, width - w) + placeholder
elif pos == 'left':
result = placeholder + _slice(text, -width + w)
else:
result = _slice(text, math.ceil((width - w) / 2)) + placeholder
result += _slice(text, -width + wlen(result))
return result
def _slice(string, length):
if not length:
return ''
if abs(length) >= wlen(string):
return string
if length < 0:
s = list(reversed(string))
l = abs(length)
else:
s = string
l = length
for i, _ in enumerate(s, 1):
if wlen(s[:i]) > l:
result = s[:i - 1]
break
elif wlen(s[:i]) == l:
result = s[:i]
break
if length < 0:
return ''.join(reversed(result))
return result