-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessing.py
More file actions
85 lines (62 loc) · 2.94 KB
/
Copy pathpreprocessing.py
File metadata and controls
85 lines (62 loc) · 2.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import numpy as np
import string
import re
class PreprocessingUnit:
    """Character-level text preprocessing for sequence models.

    Loads a text file into memory and offers in-place normalisation steps
    (case folding, punctuation and newline removal), a character vocabulary
    builder, and a batcher that produces input/target arrays in which every
    target sequence is its input sequence shifted one position forward.
    """

    def __init__(self, file, enc: str = 'utf-8') -> None:
        """Read the whole corpus from ``file``.

        Args:
            file: Path of the text file to load.
            enc: Encoding used to decode the file.

        Raises:
            OSError: If the file cannot be opened.
            UnicodeDecodeError: If the content is not valid for ``enc``.
        """
        # Use a separate handle name so the ``file`` parameter is not shadowed.
        with open(file, 'rt', encoding=enc) as handle:
            self.text = handle.read()
        self.encoding = enc
        self.vocab_size = 0
        self.token_to_idx = {}
        self.idx_to_token = {}
        # Empty integer placeholder until build_vocab() runs; avoids exposing
        # the uninitialised memory that np.empty(1) would contain.
        self.text_as_int = np.empty(0, dtype=int)
        self.inputs = None
        self.targets = None
        self.tokens = None

    def lower_case(self) -> None:
        """Convert the stored text to lower case in place."""
        self.text = self.text.lower()
        print("All characters are lowercase")

    def upper_case(self) -> None:
        """Convert the stored text to upper case in place."""
        self.text = self.text.upper()
        print("All characters are upper case")

    def remove_punctuation(self) -> None:
        """Delete every character listed in ``string.punctuation``."""
        self.text = self.text.translate(str.maketrans("", "", string.punctuation))
        print("All punctuation has been removed")

    def remove_newlines(self) -> None:
        """Replace each line feed with a space and drop carriage returns."""
        self.text = self.text.replace('\n', ' ').replace('\r', '')
        print("All newlines have been removed")

    def build_vocab(self) -> None:
        """Build the character vocabulary and encode the text as integers.

        Sets ``vocab_size``, ``token_to_idx``, ``idx_to_token`` and
        ``text_as_int``. Indices follow the sorted order of the distinct
        characters found in ``self.text``.
        """
        chars = sorted(set(self.text))
        self.vocab_size = len(chars)
        self.token_to_idx = {char: idx for idx, char in enumerate(chars)}
        self.idx_to_token = dict(enumerate(chars))
        # Explicit dtype keeps the array integer-typed even for empty text.
        self.text_as_int = np.array(
            [self.token_to_idx[char] for char in self.text], dtype=int
        )
        print("Text has been translated to integers")
        print(f"Corpus length: {len(self.text)}")
        print(f"Vocabulary size: {self.vocab_size}")

    def create_batches(self, seq_length: int, batch_size: int) -> None:
        """Create batches of sequences and corresponding targets.

        Fills ``self.inputs`` and ``self.targets`` with integer arrays of
        shape ``(n_batches, batch_size, seq_length)``. Sequences are taken
        consecutively from ``self.text_as_int``; each target sequence is the
        matching input sequence shifted one token forward. Tokens that do
        not fit into a full batch are discarded.

        Args:
            seq_length: Number of tokens per sequence.
            batch_size: Number of sequences per batch.

        Raises:
            ValueError: If ``seq_length`` or ``batch_size`` is not positive.
        """
        if seq_length <= 0 or batch_size <= 0:
            raise ValueError("seq_length and batch_size must be positive integers")

        tokens_per_batch = batch_size * seq_length
        # The last target needs one token beyond the last input, so only
        # len - 1 tokens are available as inputs. Counting batches from the
        # full length breaks when the corpus is an exact multiple of
        # tokens_per_batch (the final target slice comes up one short).
        usable = max(len(self.text_as_int) - 1, 0)
        n_batches = usable // tokens_per_batch
        n_used = n_batches * tokens_per_batch
        shape = (n_batches, batch_size, seq_length)

        # np.array copies, so the batches never alias text_as_int.
        self.inputs = np.array(self.text_as_int[:n_used], dtype=int).reshape(shape)
        self.targets = np.array(
            self.text_as_int[1:n_used + 1], dtype=int
        ).reshape(shape)

        print("Batch creation successful")
        print(f"Batches created: {n_batches}")