Vocabulary

The Vocab class maps tokens to integers and vice versa. It can also count the occurrences of tokens during tokenization.

Constructor

`CLASS` unitok.Vocab()

Parameter	Default	Type	Example	Description
`name`	`N/A`	str	`'tokens'`	The name of the vocabulary. It must be unique within each space.

from unitok import Vocab, UniTok

vocab = Vocab(name='tokens')  # OK, tokens vocab in the default space
fruits = Vocab(name='fruits')  # OK, fruits vocab in the default space
another_fruits = Vocab(name='fruits')  # ValueError: Conflict object declaration

with UniTok() as ut:
    more_fruits = Vocab(name='fruits')  # OK, fruits vocab in the ut space

Attributes

Attribute	Type	Description
`_name`	str	The name of the vocabulary.
`_editable`	bool	Whether the vocabulary is editable.
`o2i`	unitok.utils.Map	A mapping from tokens to integers.
`i2o`	unitok.utils.Map	A mapping from integers to tokens.
`counter`	unitok.vocabulary.Counter	Counts the occurrences of tokens during tokenization.

Properties

`PROPERTY` name -> str

The name of the vocabulary.

from unitok import Vocab

vocab = Vocab(name='tokens')
print(vocab.name)  # 'tokens'

`PROPERTY` size -> int

The size of the vocabulary. Same as __len__.

from unitok import Vocab

vocab = Vocab(name='tokens')
print(vocab.size)  # 0

`PROPERTY` editable -> bool

Whether the vocabulary is editable.

from unitok import Vocab

vocab = Vocab(name='tokens')
print(vocab.editable)  # True

`PROPERTY` filename -> str

The filename of the vocabulary.

from unitok import Vocab

vocab = Vocab(name='tokens')
print(vocab.filename)  # 'tokens.vocab'

Magic Methods

`METHOD` __iter__() -> Iterator[str]

Iterates over the tokens in the vocabulary.

from unitok import Vocab

vocab = Vocab(name='tokens')
vocab.extend(['apple', 'banana', 'cherry'])

for token in vocab:
    print(token)  # 'apple', 'banana', 'cherry'

`METHOD` __getitem__() -> Union[str, int]

Returns the token by index or the index by token.

Parameter	Default	Type	Example	Description
`item`	`N/A`	Union[str, int]	`'apple'`	The token or index.

from unitok import Vocab

vocab = Vocab(name='tokens')
vocab.extend(['apple', 'banana', 'cherry'])

print(vocab[0])  # 'apple'
print(vocab['apple'])  # 0

`METHOD` __contains__() -> bool

Checks if a token is in the vocabulary.

Parameter	Default	Type	Example	Description
`item`	`N/A`	str	`'apple'`	The token to check.

from unitok import Vocab

vocab = Vocab(name='tokens')
vocab.extend(['apple', 'banana', 'cherry'])

print('apple' in vocab)  # True
print('orange' in vocab)  # False

`METHOD` __len__() -> int

Returns the size of the vocabulary. Same as size.

from unitok import Vocab

vocab = Vocab(name='tokens')
vocab.extend(['apple', 'banana', 'cherry'])

print(len(vocab))  # 3

Methods

`METHOD` append() -> int

Appends a token to the vocabulary, returning the token's index.

Parameter	Default	Type	Example	Description
`obj`	`N/A`	str	`'apple'`	The token to append.
`oov_token`	`None`	Optional[int]	`0`	The out-of-vocabulary token when the vocab is not editable.

from unitok import Vocab

vocab = Vocab(name='tokens')
vocab.append('apple')  # returns 0
vocab.append('banana') # returns 1

vocab.deny_edit()  # make the vocab read-only

vocab.append('apple')  # 0
vocab.append('orange')  # ValueError
vocab.append('orange', oov_token=0)  # 0

`METHOD` extend() -> List[int]

Extends the vocabulary with a list of tokens, returning the indices of the tokens.

Parameter	Default	Type	Example	Description
`objs`	`N/A`	List[str]	`['apple', 'banana', 'cherry']`	The tokens to append.

from unitok import Vocab

vocab = Vocab(name='tokens')
vocab.extend(['apple', 'banana', 'cherry'])  # [0, 1, 2]

`METHOD` deny_edit() -> self

Prevents the vocabulary from being extended.

from unitok import Vocab

vocab = Vocab(name='tokens')
vocab.deny_edit()

vocab.append('apple')  # ValueError

`METHOD` allow_edit() -> self

Allows the vocabulary to be extended.

from unitok import Vocab

vocab = Vocab(name='tokens')
vocab.deny_edit()

try:
    vocab.append('apple')
except ValueError:
    vocab.allow_edit()
    vocab.append('orange')

print(list(vocab))  # ['orange']

`METHOD` trim() -> self

Trims the vocabulary by removing tokens with frequencies below a threshold.

Parameter	Default	Type	Example	Description
`min_count`	`N/A`	int	`2`	The minimum frequency.

from unitok import Vocab

vocab = Vocab(name='tokens')

vocab.counter.activate().initialize()
vocab.extend(['apple', 'banana', 'cherry', 'apple', 'banana'])

vocab.trim(min_count=2)
print(list(vocab))  # ['apple', 'banana']

`METHOD` summarize() -> Dict[tuple, int]

Summarizes the vocabulary by grouping tokens into logarithmic frequency buckets and counting the number of tokens in each bucket.

Parameter	Default	Type	Example	Description
`base`	`10`	int	`10`	The base of the logarithm.

from unitok import Vocab

vocab = Vocab(name='tokens')
vocab.counter.activate().initialize()

vocab.extend(['apple', 'apple', 'apple', 'apple', 'banana', 'banana', 'cherry', 'orange'])
vocab.summarize(base=2)  # {(1, 2): 2, (2, 4): 1, (4, 8): 1}

# The left bound of the key is inclusive, and the right bound is exclusive.
# (1, 2): 2, there are 2 tokens with a frequency of 1, i.e., 'cherry' and 'orange'.
# (2, 4): 1, there is 1 token with a frequency between 2 and 3, i.e., 'banana'.
# (4, 8): 1, there is 1 token with a frequency between 4 and 7, i.e., 'apple'.

`METHOD` filepath() -> str

Returns the filepath of the vocabulary.

Parameter	Default	Type	Example	Description
`save_dir`	`N/A`	str	`'.'`	The directory path.

from unitok import Vocab

vocab = Vocab(name='tokens')
print(vocab.filepath('fruit'))  # 'fruit/tokens.vocab'

`METHOD` save() -> self

Saves the vocabulary to a file.

Parameter	Default	Type	Example	Description
`save_dir`	`N/A`	str	`'.'`	The directory path.

from unitok import Vocab

vocab = Vocab(name='tokens')
vocab.extend(['apple', 'banana', 'cherry'])
vocab.save('fruit')

`METHOD` load() -> self

Loads the vocabulary from a file.

Parameter	Default	Type	Example	Description
`save_dir`	`N/A`	str	`'.'`	The directory path.

from unitok import Vocab

vocab = Vocab(name='tokens').load('fruit')
print(list(vocab))  # ['apple', 'banana', 'cherry']

`METHOD` json() -> dict

Returns the metadata of the vocabulary.

from unitok import Vocab

vocab = Vocab(name='tokens')
vocab.extend(['apple', 'banana', 'cherry'])

print(vocab.json())  # {'name': 'tokens', 'vocab_size': 3}

Vocabulary

Constructor

CLASS unitok.Vocab()

Attributes

Properties

PROPERTY name -> str

PROPERTY size -> int

PROPERTY editable -> bool

PROPERTY filename -> str

Magic Methods

METHOD __iter__() -> Iterator[str]

METHOD __getitem__() -> Union[str, int]

METHOD __contains__() -> bool

METHOD __len__() -> int

Methods

METHOD append() -> int

METHOD extend() -> List[int]

METHOD deny_edit() -> self

METHOD allow_edit() -> self

METHOD trim() -> self

METHOD summarize() -> Dict[tuple, int]

METHOD filepath() -> str

METHOD save() -> self

METHOD load() -> self

METHOD json() -> dict