# -*- coding:utf-8 -*-
"""
Created on 02/05/2018
@author: Nicolas Thiebaut
@email: nicolas@visage.jobs
"""
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.base import BaseEstimator
from sklearn.pipeline import TransformerMixin
class TextsToSequences(Tokenizer, BaseEstimator, TransformerMixin):
    """Sklearn transformer to convert texts to lists of word indices.

    Wraps ``keras.preprocessing.text.Tokenizer`` in the scikit-learn
    fit/transform API so it can be used as a step in a ``Pipeline``.

    Example
    -------
    >>> from zeugma import TextsToSequences
    >>> sequencer = TextsToSequences()
    >>> sequencer.fit_transform(["the cute cat", "the dog"])
    [[1, 2, 3], [1, 4]]
    """

    def __init__(self, **kwargs):
        # Forward keyword arguments (num_words, filters, lower, ...)
        # straight to the keras Tokenizer.
        super().__init__(**kwargs)

    def fit(self, texts, y=None):
        """Build the word -> index vocabulary from an iterable of texts."""
        self.fit_on_texts(texts)
        return self

    def transform(self, texts, y=None):
        """Map each text to its list of word indices.

        Returns the plain list of lists produced by ``texts_to_sequences``.
        The sequences are ragged, so wrapping them in ``np.array`` (as the
        original code did) creates a ``dtype=object`` array — an error in
        numpy >= 1.24 — and also breaks the doctest above, which expects
        ``[[1, 2, 3], [1, 4]]``.
        """
        return self.texts_to_sequences(texts)
class Padder(BaseEstimator, TransformerMixin):
    """Pad and crop uneven lists to the same length.

    Only the end of lists longer than the max_length attribute are
    kept, and lists shorter than max_length are left-padded with zeros.

    Attributes
    ----------
    max_length: int
        size of sequences after padding
    max_index: int
        maximum index known by the Padder; if a higher index is met during
        transform it is transformed to a 0
    """

    def __init__(self, max_length=500):
        self.max_length = max_length
        # Set by fit(): the largest word index seen in the training data.
        self.max_index = None

    def fit(self, X, y=None):
        """Record the largest index present in the padded training data."""
        self.max_index = pad_sequences(X, maxlen=self.max_length).max()
        return self

    def transform(self, X, y=None):
        """Pad/crop X to ``max_length`` and zero out unseen indices.

        NOTE(review): this method was absent from the scraped source even
        though the class mixes in ``TransformerMixin`` (so ``fit_transform``
        requires it) and the class docstring documents its behavior.
        Reconstructed from that documented contract — confirm against the
        upstream module.
        """
        X = pad_sequences(X, maxlen=self.max_length)
        # Indices never seen during fit are mapped to the padding value 0,
        # as stated in the class docstring.
        X[X > self.max_index] = 0
        return X
if __name__ == "__main__":
    # Run the doctest examples embedded in this module's docstrings
    # (e.g. the TextsToSequences usage example) when executed as a script.
    import doctest
    doctest.testmod()