FAQ

1. GloVe Vocabulary Construction

How do I construct a GloVe vocabulary for `GloVeTokenizer`?
import os
import subprocess
import zipfile

from unitok import Vocab


# Remote URL of the GloVe archive fetched by get_glove_vocabulary().
DL_URI = 'https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip'
# Local path the downloaded archive is written to (download is skipped if it exists).
ZIP_NAME = './glove.6B.zip'
# Local path of the embedding text file whose first column supplies the tokens.
FILE_NAME = './glove.6B.300d.txt'


def get_glove_vocabulary():
    """Build, or load from the local cache, a vocabulary of GloVe tokens.

    If a vocabulary named ``glove`` has already been saved in the current
    directory, it is loaded and returned. Otherwise the GloVe archive is
    downloaded (unless ``ZIP_NAME`` already exists), unpacked (unless
    ``FILE_NAME`` already exists), the first field of every non-blank line
    of ``FILE_NAME`` is appended to a fresh vocabulary, and the vocabulary
    is saved to the current directory so the next call can reuse it.

    Returns:
        The result of ``vocab.load('.')`` on a cache hit, otherwise the
        newly built ``Vocab``.

    Raises:
        subprocess.CalledProcessError: If ``curl`` exits with a non-zero
            status while downloading the archive.
    """
    vocab = Vocab(name='glove')

    # Cache hit only if the saved vocabulary file is really on disk.
    # (os.path.join() would return a non-empty string and always be truthy.)
    if os.path.exists(vocab.filepath('.')):
        return vocab.load('.')

    if not os.path.exists(ZIP_NAME):
        # Download the GloVe archive from the configured URL.
        print(f"Downloading GloVe embeddings from {DL_URI}...")
        # --fail: make HTTP errors a non-zero exit instead of saving an error
        # page as the zip; --location: follow redirects.
        subprocess.run(
            ["curl", "--fail", "--location", "-o", ZIP_NAME, DL_URI],
            check=True,
        )
        print(f"GloVe downloaded to {ZIP_NAME}")

    if not os.path.exists(FILE_NAME):
        # Unpack into the directory that should contain FILE_NAME.
        # extractall() takes a target *directory*; passing FILE_NAME itself
        # would create a directory with that name and break the open() below.
        print("Unzipping GloVe embeddings...")
        target_dir = os.path.dirname(FILE_NAME) or '.'
        with zipfile.ZipFile(ZIP_NAME, 'r') as zip_ref:
            zip_ref.extractall(target_dir)
        print(f"Unzipped to {FILE_NAME}")

    # Collect the token (first whitespace-separated field) of every line.
    with open(FILE_NAME, 'r', encoding='utf-8') as f:
        for line in f:
            # maxsplit=1 avoids splitting the whole vector just to drop it.
            fields = line.split(maxsplit=1)
            if not fields:
                # Blank line: nothing to add.
                continue
            vocab.append(fields[0])

    # Save to the same directory the cache check above looks in, so that the
    # next call can load the vocabulary instead of rebuilding it.
    vocab.save('.')
    return vocab