from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import sys
import glob
import io
import gzip
import re
import itertools

# Dependency imports

import numpy as np
import six
import subprocess

from tensor2tensor.data_generators import generator_utils
from tensor2tensor.data_generators import problem
from tensor2tensor.data_generators import text_problems
from tensor2tensor.utils import registry

import tensorflow as tf

# Not actually chopped, just stealing their parallelism
class ParallelText2SelfProblem(text_problems.ChoppedTextProblem):
    """Targets-only text problem that emits one string per line of each file.

    Subclasses supply ``train_text_filepaths`` / ``dev_text_filepaths`` and may
    override ``preprocess_item`` to change how a raw line becomes a string.
    """

    @property
    def has_inputs(self):
        """This problem has no "inputs" feature; examples carry targets only."""
        return False

    @property
    def lines_per_file_for_vocab(self):
        """Maximum number of lines taken from each training file for the vocab."""
        return 10000

    def max_length(self, model_hparams):
        """Return the maximum length, bypassing ChoppedTextProblem's override.

        Starting the ``super`` lookup at ``ChoppedTextProblem`` dispatches to
        the next class after it in the MRO instead of to its own method.
        """
        parent_view = super(text_problems.ChoppedTextProblem, self)
        return parent_view.max_length(model_hparams)

    def filepath_to_unicode_strings(self, filepath):
        """Yield one preprocessed string per line of ``filepath``.

        Paths ending in ``.gz`` are read through gzip; everything is decoded as
        UTF-8 with undecodable bytes dropped.

        Args:
            filepath: a string, path of the text file to read.
        Yields:
            the result of ``preprocess_item`` for every line, in file order.
        """
        if filepath.endswith('.gz'):
            stream = io.TextIOWrapper(
                gzip.open(filepath), encoding='utf_8', errors='ignore')
        else:
            stream = io.open(filepath, encoding='utf_8', errors='ignore')
        with stream as text_lines:
            for raw_line in text_lines:
                yield self.preprocess_item(raw_line)

    def preprocess_item(self, line):
        """Turn a raw line into an example string by dropping trailing whitespace."""
        stripped = line.rstrip()
        return stripped

    def generate_text_for_vocab(self, data_dir, tmp_dir):
        """Yield the leading lines of every training file for vocab building.

        Args:
            data_dir: a string (unused here).
            tmp_dir: a string, passed to ``train_text_filepaths``.
        Yields:
            at most ``lines_per_file_for_vocab`` strings from each file.
        """
        for path in self.train_text_filepaths(tmp_dir):
            all_lines = self.filepath_to_unicode_strings(path)
            leading = itertools.islice(all_lines, self.lines_per_file_for_vocab)
            for text in leading:
                yield text

    def example_generator(self, encoder, tmp_dir, task_id):
        """Generator for examples.

        Args:
        encoder: a TextEncoder
        tmp_dir: a string
        task_id: an integer
        Yields:
        feature dictionaries
        """
        task_files = self.text_filepaths_for_task(tmp_dir, task_id)
        # No per-file character cap: every string the inherited file_generator
        # produces is encoded and emitted.
        texts = self.file_generator(task_files, max_chars_per_file=None)
        for text in texts:
            yield {"targets": encoder.encode(text)}


# Matches one backslash escape:
#   * a backslash followed by any single character other than "u"
#     (captured as "esc"), or
#   * a backslash-u followed by either exactly four word characters or a
#     brace-enclosed run of non-"}" characters (captured as "code").
UNICODE_ESCAPE_REGEX = re.compile(
    r"""\\(?:(?P<esc>[^u])|u(?P<open>\{)?(?P<code>(?(open)[^\}]+|\w{4}))(?(open)\}))""",
    flags=re.VERBOSE,
)

# Single-character escapes that are replaced by a space; any escape character
# not listed here is looked up by callers with itself as the fallback.
ESC_MAP = dict(n=' ', r=' ', t=' ')


def _decode_escape(m):
    """Return the replacement text for one match of UNICODE_ESCAPE_REGEX.

    Args:
        m: a regex match object exposing the named groups "code" and "esc".

    Returns:
        For a backslash-u escape, the character whose code point is the
        hexadecimal value captured in "code"; if that value is not valid
        hexadecimal or not a valid code point, the matched text unchanged.
        For any other escape, the ESC_MAP entry for the escaped character,
        or the escaped character itself when it has no entry.
    """
    charcode = m.group('code')
    if charcode:
        try:
            return six.unichr(int(charcode, 16))
        except (ValueError, OverflowError):
            # The pattern accepts any word characters (or any brace contents)
            # as the code, so it may be non-hex or out of range. Keep the
            # original text rather than aborting on one malformed escape.
            return m.group(0)
    esc = m.group('esc')
    return ESC_MAP.get(esc, esc)


def decode_line(line):
    """Strip a line's wrapper characters and decode its backslash escapes.

    Args:
        line: one line of text, with or without its trailing newline. The
            first and last remaining characters are discarded (presumably
            surrounding quote characters -- TODO confirm against the data).

    Returns:
        The inner text with every match of UNICODE_ESCAPE_REGEX replaced by
        the result of _decode_escape.
    """
    # Remove the newline only when it is there, so that a final line without
    # one does not lose a character of real content.
    if line.endswith('\n'):
        line = line[:-1]
    line = line[1:-1]
    return UNICODE_ESCAPE_REGEX.sub(_decode_escape, line)


@registry.register_problem
class GigaTwitter(ParallelText2SelfProblem):
    """Line-per-example problem over the files in tmp_dir's train/test folders."""

    def train_text_filepaths(self, tmp_dir):
        """Return every path under ``<tmp_dir>/train`` in sorted order.

        glob returns matches in arbitrary order; sorting keeps the listing
        identical across calls and processes, which matters when files are
        selected per task (presumably by position -- verify against
        ``text_filepaths_for_task``).
        """
        return sorted(glob.glob(os.path.join(tmp_dir, "train", "*")))

    def dev_text_filepaths(self, tmp_dir):
        """Return every path under ``<tmp_dir>/test`` in sorted order."""
        return sorted(glob.glob(os.path.join(tmp_dir, "test", "*")))


@registry.register_problem
class GigaRaw(ParallelText2SelfProblem):
    """Line-per-example problem whose lines are unwrapped with ``decode_line``."""

    @property
    def approx_vocab_size(self):
        """Approximate vocabulary size (2**17)."""
        return 131072

    @property
    def max_chars_for_vocab(self):
        """Number of characters of training data to use for generating vocab."""
        return None

    def preprocess_item(self, line):
        """Turn a raw line into an example string via ``decode_line``."""
        return decode_line(line)

    def train_text_filepaths(self, tmp_dir):
        """Return every path under ``<tmp_dir>/train`` in sorted order.

        glob returns matches in arbitrary order; sorting keeps the listing
        identical across calls and processes, which matters when files are
        selected per task (presumably by position -- verify against
        ``text_filepaths_for_task``).
        """
        return sorted(glob.glob(os.path.join(tmp_dir, "train", "*")))

    def dev_text_filepaths(self, tmp_dir):
        """Return every path under ``<tmp_dir>/test`` in sorted order."""
        return sorted(glob.glob(os.path.join(tmp_dir, "test", "*")))

