Source code for textworld.gym.spaces.text_spaces

import re
import string
import numpy as np

import gym
import gym.spaces


class VocabularyHasDuplicateTokens(ValueError):
    """ Raised when a vocabulary contains the same token more than once. """


class Char(gym.spaces.MultiDiscrete):
    """ Character observation/action space

    This space consists of a series of `gym.spaces.Discrete` objects all with
    the same parameters, one per character position (`max_length` of them).
    Character ids are indices into `self.vocab`.

    Notes
    -----
    The following special token will be prepended (if needed) to the vocabulary:

      * '#' : Padding token

    """

    def __init__(self, max_length, vocab=None, extra_vocab=None):
        """
        Parameters
        ----------
        max_length : int
            Maximum number of characters in a text.
        vocab : list of char, optional
            Vocabulary defining this space. It shouldn't contain any
            duplicate characters. If not provided, the vocabulary will consist
            of characters [a-z0-9], punctuations [" ", "-", "'"] and padding '#'.
            The provided sequence is copied, never modified.
        extra_vocab : list of char, optional
            Additional tokens to add to the vocabulary.

        Raises
        ------
        VocabularyHasDuplicateTokens
            If `vocab` combined with `extra_vocab` contains duplicates.
        """
        if vocab is None:
            vocab = list(string.ascii_lowercase + string.digits)
            vocab += [" ", "-", "'"]

        # Build a fresh list rather than using `+=`, so the caller's `vocab`
        # is left untouched and any sequence type (tuple, str, ...) works.
        vocab = list(vocab)
        if extra_vocab is not None:
            vocab += list(extra_vocab)

        if len(vocab) != len(set(vocab)):
            raise VocabularyHasDuplicateTokens()

        self.max_length = max_length
        self.PAD = "#"
        special_tokens = [self.PAD]
        # Special tokens come first, unless the vocabulary already has them.
        self.vocab = [t for t in special_tokens if t not in vocab]
        self.vocab += vocab
        self.vocab_set = set(self.vocab)  # For faster lookup.
        self.vocab_size = len(self.vocab)
        self.id2c = {i: c for i, c in enumerate(self.vocab)}
        self.c2id = {c: i for i, c in self.id2c.items()}
        self.PAD_id = self.c2id[self.PAD]
        # NOTE(review): if `MultiDiscrete` interprets its argument as the
        # number of values per dimension (`nvec`), then `len(vocab) - 1`
        # leaves the last character id outside of the space -- confirm
        # against the gym version in use before changing it.
        super().__init__([len(self.vocab) - 1] * self.max_length)
        self.dtype = np.int64  # Overwrite Gym's dtype=int8.

    def filter_unknown(self, text):
        """ Strip out all characters not in the vocabulary. """
        return "".join(c for c in text if c in self.vocab_set)

    def tokenize(self, text, padding=False):
        """ Tokenize characters found in the vocabulary.

        The text is lowercased and characters missing from the vocabulary
        are dropped before being converted to ids.

        Parameters
        ----------
        text : str
            Text to tokenize.
        padding : bool, optional
            If True, pad the ids with `self.PAD_id` up to `self.max_length`.

        Returns
        -------
        numpy.ndarray
            1D array of character ids (dtype int64).

        Raises
        ------
        AssertionError
            If `padding` is True and the text has more than
            `self.max_length` known characters.
        """
        text = self.filter_unknown(text.lower())
        ids = [self.c2id[c] for c in text]

        # Add padding.
        if padding:
            nb_pads = self.max_length - len(ids)
            msg = "Provided `max_length` was not large enough ({} chars).".format(len(ids))
            assert nb_pads >= 0, msg
            ids += [self.PAD_id] * nb_pads

        # Explicit dtype: an empty list would otherwise yield a float array.
        return np.array(ids, dtype=np.int64)

    def __repr__(self):
        return "Character({})".format(self.max_length)


class Word(gym.spaces.MultiDiscrete):
    """ Word observation/action space

    This space consists of a series of `gym.spaces.Discrete` objects all with
    the same parameters, one per word position (`max_length` of them).
    Word ids are indices into `self.vocab`.

    Notes
    -----
    The following special tokens will be prepended (if needed) to the vocabulary:

      * '<PAD>' : Padding
      * '<UNK>' : Unknown word
      * '<S>'   : Beginning of sentence
      * '</S>'  : End of sentence
      * '<|>'   : Separator

    Example
    -------
    Let's create an action space that can be used with
    :py:meth:`textworld.gym.register_game <textworld.gym.utils.register_game>`.
    We are going to assume actions are short phrases up to 8 words long.

    >>> import textworld
    >>> gamefiles = ["/path/to/game.ulx", "/path/to/another/game.z8"]
    >>> vocab = textworld.vocab.extract_from(gamefiles)
    >>> vocab = sorted(vocab)  # Sorting the vocabulary, optional.
    >>> action_space = textworld.gym.text_spaces.Word(max_length=8, vocab=vocab)

    """

    def __init__(self, max_length, vocab):
        """
        Parameters
        ----------
        max_length : int
            Maximum number of words in a text.
        vocab : list of strings
            Vocabulary defining this space. It shouldn't contain any
            duplicate words. The provided sequence is copied, never modified.

        Raises
        ------
        VocabularyHasDuplicateTokens
            If `vocab` contains duplicates.
        """
        # Materialize once so any iterable is accepted and reused safely.
        vocab = list(vocab)
        if len(vocab) != len(set(vocab)):
            raise VocabularyHasDuplicateTokens()

        self.max_length = max_length
        self.PAD = "<PAD>"
        self.UNK = "<UNK>"
        self.BOS = "<S>"
        self.EOS = "</S>"
        self.SEP = "<|>"
        # The order below determines the ids of the special tokens.
        special_tokens = [self.PAD, self.UNK, self.EOS, self.BOS, self.SEP]
        # Special tokens come first, unless the vocabulary already has them.
        self.vocab = [w for w in special_tokens if w not in vocab]
        self.vocab += vocab
        self.vocab_set = set(self.vocab)  # For faster lookup.
        self.vocab_size = len(self.vocab)
        self.id2w = {i: w for i, w in enumerate(self.vocab)}
        self.w2id = {w: i for i, w in self.id2w.items()}
        self.PAD_id = self.w2id[self.PAD]
        self.UNK_id = self.w2id[self.UNK]
        self.BOS_id = self.w2id[self.BOS]
        self.EOS_id = self.w2id[self.EOS]
        self.SEP_id = self.w2id[self.SEP]
        # NOTE(review): if `MultiDiscrete` interprets its argument as the
        # number of values per dimension (`nvec`), then `len(vocab) - 1`
        # leaves the last word id outside of the space -- confirm against
        # the gym version in use before changing it.
        super().__init__([len(self.vocab) - 1] * self.max_length)
        self.dtype = np.int64  # Overwrite Gym's dtype=int8.

    def tokenize(self, text, padding=False):
        """ Tokenize words found in the vocabulary.

        The text is lowercased, each "." is turned into an end-of-sentence /
        beginning-of-sentence pair, and the whole text is wrapped in
        `<S>` ... `</S>`. Words missing from the vocabulary are mapped to
        `self.UNK_id`.

        Parameters
        ----------
        text : str
            Text to tokenize.
        padding : bool, optional
            If True, pad the ids with `self.PAD_id` up to `self.max_length`.

        Returns
        -------
        numpy.ndarray
            1D array of word ids (dtype int64).

        Raises
        ------
        AssertionError
            If `padding` is True and the text yields more than
            `self.max_length` tokens (sentence markers included).
        """
        text = text.lower()  # Work only with lowercase letters.
        # Find beginning and end of sentences.
        # Note: a text ending with "." yields a trailing empty "<S> </S>" pair.
        text = text.replace(".", " </S> <S> ")
        text = "<S> " + text + " </S>"

        # Drop apostrophes, then replace by a space every character that is
        # neither alphanumeric, a space, nor part of the "<S>"/"</S>" markers.
        text = text.replace("'", "")
        text = re.sub("[^a-z0-9 <S>/]", " ", text)
        # TODO: convert numbers to text?

        # Get words ids and replace unknown words with <UNK>.
        words = text.split()
        ids = [self.w2id.get(w, self.UNK_id) for w in words]

        # Add padding.
        if padding:
            nb_pads = self.max_length - len(ids)
            msg = "Provided `max_length` was not large enough ({} words).".format(len(ids))
            assert nb_pads >= 0, msg
            ids += [self.PAD_id] * nb_pads

        # Explicit dtype so the result matches the dtype declared by the space.
        return np.array(ids, dtype=np.int64)

    def __repr__(self):
        return "Word(L={}, V={})".format(self.max_length, self.vocab_size)