Source code for zeugma.keras_transformers

# -*- coding:utf-8 -*-
Created on the 02/05/2018
@author: Nicolas Thiebaut
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.base import BaseEstimator
from sklearn.pipeline import TransformerMixin

[docs]class TextsToSequences(Tokenizer, BaseEstimator, TransformerMixin): """ Sklearn transformer to convert texts to indices list Example ------- >>> from zeugma import TextsToSequences >>> sequencer = TextsToSequences() >>> sequencer.fit_transform(["the cute cat", "the dog"]) [[1, 2, 3], [1, 4]] """ def __init__(self, **kwargs): super().__init__(**kwargs)
[docs] def fit(self, texts, y=None): self.fit_on_texts(texts) return self
[docs] def transform(self, texts, y=None): return np.array(self.texts_to_sequences(texts))
[docs]class Padder(BaseEstimator, TransformerMixin): """ Pad and crop uneven lists to the same length. Only the end of lists longer than the max_length attribute are kept, and lists shorter than max_length are left-padded with zeros Attributes ---------- max_length: int sizes of sequences after padding max_index: int maximum index known by the Padder, if a higher index is met during transform it is transformed to a 0 """ def __init__(self, max_length=500): self.max_length = max_length self.max_index = None
[docs] def fit(self, X, y=None): self.max_index = pad_sequences(X, maxlen=self.max_length).max() return self
[docs] def transform(self, X, y=None): X = pad_sequences(X, maxlen=self.max_length) X[X > self.max_index] = 0 return X
if __name__ == "__main__": import doctest doctest.testmod()