# ImdbMovieReviews.py
import tarfile
import re

from helpers import download


class ImdbMovieReviews:

    DEFAULT_URL = \
        'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
    TOKEN_REGEX = re.compile(r'[A-Za-z]+|[!?.:,()]')

    def __init__(self, cache_dir, url=None):
self._cache_dir = cache_dir
        self._url = url or type(self).DEFAULT_URL

    def __iter__(self):
filepath = download(self._url, self._cache_dir)
        with tarfile.open(filepath) as archive:
            for filename in archive.getnames():
                if filename.startswith('aclImdb/train/pos/'):
                    yield self._read(archive, filename), True
                elif filename.startswith('aclImdb/train/neg/'):
                    yield self._read(archive, filename), False

    def _read(self, archive, filename):
        with archive.extractfile(filename) as file_:
            data = file_.read().decode('utf-8')
data = type(self).TOKEN_REGEX.findall(data)
            data = [x.lower() for x in data]
            return data
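

# The helpers module used above is not included in the source. A minimal
# sketch of a compatible download(url, cache_dir) helper, inferred from the
# call in __iter__; the caching behaviour is an assumption:
import os
from urllib.request import urlretrieve


def download(url, cache_dir):
    # Cache the archive under its URL basename; skip the fetch if present.
    os.makedirs(cache_dir, exist_ok=True)
    filepath = os.path.join(cache_dir, url.split('/')[-1])
    if not os.path.isfile(filepath):
        urlretrieve(url, filepath)
    return filepath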


# Embedding.py
import bz2

import numpy as np


class Embedding:

    def __init__(self, vocabulary_path, embedding_path, length):
self._embedding = np.load(embedding_path)
        with bz2.open(vocabulary_path, 'rt') as file_:
            self._vocabulary = {k.strip(): i for i, k in enumerate(file_)}
        self._length = length

    def __call__(self, sequence):
data = np.zeros((self._length, self._embedding.shape[1]))
        indices = [self._vocabulary.get(x, 0) for x in sequence]
        embedded = self._embedding[indices]
        data[:len(sequence)] = embedded
        return data

    @property
    def dimensions(self):
        return self._embedding.shape[1]
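

# lazy_property, imported from helpers below, is not defined in the source.
# A minimal sketch of the usual pattern, which runs the decorated function
# once and caches its result so every graph node is constructed only once;
# the caching attribute name is an assumption:
import functools


def lazy_property(function):
    attribute = '_cache_' + function.__name__

    @property
    @functools.wraps(function)
    def wrapper(self):
        if not hasattr(self, attribute):
            setattr(self, attribute, function(self))
        return getattr(self, attribute)
    return wrapper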


# SequenceClassificationModel.py
import tensorflow as tf

from helpers import lazy_property


class SequenceClassificationModel:

    def __init__(self, data, target, params):
self.data = data
self.target = target
self.params = params
        # Touching the lazy properties here adds their ops to the graph
        # at construction time.
        self.prediction
        self.cost
        self.error
        self.optimize
    @lazy_property
    def length(self):
        # Infer the actual sequence lengths from the zero padding of the
        # embedded input.
        used = tf.sign(tf.reduce_max(tf.abs(self.data), reduction_indices=2))
        length = tf.reduce_sum(used, reduction_indices=1)
        length = tf.cast(length, tf.int32)
        return length
    @lazy_property
    def prediction(self):
        # Recurrent network.
        output, _ = tf.nn.dynamic_rnn(
self.params.rnn_cell(self.params.rnn_hidden),
self.data,
dtype=tf.float32,
sequence_length=self.length,
)
        last = self._last_relevant(output, self.length)
        # Softmax layer.
        num_classes = int(self.target.get_shape()[1])
weight = tf.Variable(tf.truncated_normal(
[self.params.rnn_hidden, num_classes], stddev=0.01))
bias = tf.Variable(tf.constant(0.1, shape=[num_classes]))
prediction = tf.nn.softmax(tf.matmul(last, weight) + bias)
        return prediction

    @lazy_property
    def cost(self):
cross_entropy = -tf.reduce_sum(self.target * tf.log(self.prediction))
        return cross_entropy

    @lazy_property
    def error(self):
mistakes = tf.not_equal(
tf.argmax(self.target, 1), tf.argmax(self.prediction, 1))
        return tf.reduce_mean(tf.cast(mistakes, tf.float32))

    @lazy_property
    def optimize(self):
gradient = self.params.optimizer.compute_gradients(self.cost)
        try:
            limit = self.params.gradient_clipping
            gradient = [
                (tf.clip_by_value(g, -limit, limit), v)
                if g is not None else (None, v)
                for g, v in gradient]
        except AttributeError:
            print('No gradient clipping parameter specified.')
optimize = self.params.optimizer.apply_gradients(gradient)
        return optimize

    @staticmethod
    def _last_relevant(output, length):
batch_size = tf.shape(output)[0]
max_length = int(output.get_shape()[1])
output_size = int(output.get_shape()[2])
index = tf.range(0, batch_size) * max_length + (length - 1)
flat = tf.reshape(output, [-1, output_size])
relevant = tf.gather(flat, index)
        return relevant
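

# The flatten-and-gather trick above selects, for each sequence in the batch,
# the RNN output at its true last time step. A NumPy analogue of the index
# arithmetic, purely as an illustration (shapes and values are made up):
import numpy as np

demo_output = np.arange(24).reshape(2, 3, 4)       # batch=2, max_length=3, size=4
demo_length = np.array([2, 3])                     # actual length per sequence
demo_flat = demo_output.reshape(-1, 4)             # one row per (batch, step) pair
demo_index = np.arange(2) * 3 + (demo_length - 1)  # flat row of each last step
demo_last = demo_flat[demo_index]                  # rows output[0, 1] and output[1, 2]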


# Training script.
import tensorflow as tf

from helpers import AttrDict
from Embedding import Embedding
from ImdbMovieReviews import ImdbMovieReviews
from preprocess_batched import preprocess_batched
from SequenceClassificationModel import SequenceClassificationModel

IMDB_DOWNLOAD_DIR = './imdb'
WIKI_VOCAB_DIR = '../01_wikipedia/wikipedia'
WIKI_EMBED_DIR = '../01_wikipedia/wikipedia'
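
# AttrDict from helpers is not shown in the source. A minimal sketch that
# matches its use here: keyword arguments become attributes, and a missing
# name raises AttributeError, which the optimize property relies on to
# detect an absent gradient_clipping setting:
class AttrDict(dict):

    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)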
params = AttrDict(
rnn_cell=tf.contrib.rnn.GRUCell,
rnn_hidden=300,
optimizer=tf.train.RMSPropOptimizer(0.002),
batch_size=20,
)
reviews = ImdbMovieReviews(IMDB_DOWNLOAD_DIR)
length = max(len(x[0]) for x in reviews)
embedding = Embedding(
    WIKI_VOCAB_DIR + '/vocabulary.bz2',
    WIKI_EMBED_DIR + '/embeddings.npy', length)
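
# preprocess_batched is imported above but not defined in the source. A
# minimal sketch consistent with how its output is consumed below: each batch
# pairs an embedded data array [batch_size, length, dimensions] with one-hot
# labels [batch_size, 2]. The label encoding and the dropping of a trailing
# partial batch are assumptions:
import numpy as np


def preprocess_batched(iterator, length, embedding, batch_size):
    iterator = iter(iterator)
    while True:
        data = np.zeros((batch_size, length, embedding.dimensions))
        target = np.zeros((batch_size, 2))
        for index in range(batch_size):
            try:
                text, label = next(iterator)
            except StopIteration:
                return
            data[index] = embedding(text)  # embed and zero-pad the tokens
            target[index] = [1, 0] if label else [0, 1]
        yield data, target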
batches = preprocess_batched(reviews, length, embedding, params.batch_size)
data = tf.placeholder(tf.float32, [None, length, embedding.dimensions])
target = tf.placeholder(tf.float32, [None, 2])
model = SequenceClassificationModel(data, target, params)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
for index, batch in enumerate(batches):
feed = {data: batch[0], target: batch[1]}
error, _ = sess.run([model.error, model.optimize], feed)
    print('{}: {:3.1f}%'.format(index + 1, 100 * error))