First, I did some preprocessing to get binary (black and white) images. For that I first converted images to grayscale using rgb2gray and then to binary using imbinarize in MATLAB.
Now that I had binary images, I just tweaked the following code to train on my data:
The code is provided below. I also did some post-processing, which improved the results a lot. I took each predicted word, calculated its Levenshtein distance to every word in the dictionary built from the training-set labels, and then replaced the predicted word with the dictionary word that has the smallest Levenshtein distance to it.
Code for postprocessing:
import pandas as pd
from Levenshtein import distance as levenshtein_distance
import numpy as np
# Load the raw OCR predictions and split every prediction into its words.
# All rows of the file are processed (no hard-coded row count).  A missing
# prediction is read by pandas as NaN; it is mapped to an empty word list
# instead of the literal string "nan", which would otherwise be "corrected"
# into some real dictionary word further down.
xx = pd.read_csv("predictions.csv")
ocr_list = [
    [] if pd.isna(label) else str(label).split()
    for label in xx["label"]
]
# Build the correction dictionary: every distinct word that occurs in a
# training label, in first-occurrence order.
#  * All rows are used (no hard-coded row count).
#  * Missing labels are skipped so that the string "nan" does not become a
#    dictionary word.
#  * Duplicates are dropped with dict.fromkeys, which keeps first-occurrence
#    order; a nearest-word search that keeps the earliest best match
#    therefore returns the same word while comparing against far fewer
#    candidates.
yy = pd.read_csv("train.csv")
list2 = list(
    dict.fromkeys(
        word
        for label in yy["label"].dropna()
        for word in str(label).split()
    )
)
def distancer(x, y):
    """Return the entry of ``y`` that is closest to ``x`` in edit distance.

    Args:
        x: The word to correct.
        y: Sequence of candidate words.

    Returns:
        The candidate with the smallest Levenshtein distance to ``x``.  On a
        tie the earliest candidate in ``y`` wins.  If ``y`` is empty there is
        nothing to snap to, so ``x`` is returned unchanged.
    """
    if len(y) == 0:
        return x
    # min() keeps the first minimal element, matching a strict "<" scan.
    return min(y, key=lambda candidate: levenshtein_distance(candidate, x))
# Replace every predicted word by its nearest dictionary word, in place.
# Predictions of any length are handled (not only one- or two-word ones).
# Results are cached per distinct word because the same word is typically
# predicted many times and each lookup scans the whole dictionary.
corrections = {}
for words in ocr_list:
    for pos, word in enumerate(words):
        if word not in corrections:
            corrections[word] = distancer(word, list2)
        words[pos] = corrections[word]
# Write the corrected words back into the frame and save the submission.
# A missing prediction is a float NaN after read_csv, so it has to be
# detected with pd.isna(); comparing it with the string 'nan' is never true.
# The string comparison is kept as well for frames whose labels were already
# stringified.
missing = [pd.isna(value) or value == "nan" for value in xx["label"]]
# Make sure the column can hold strings even if every value was NaN.
xx["label"] = xx["label"].astype(object)
for i, words in enumerate(ocr_list):
    xx.at[i, "label"] = "" if missing[i] else " ".join(words)
xx.to_csv("submission.csv", index=False)
Training code: From https://github.com/keras-team/keras-io/blob/master/examples/vision/captcha_ocr.py
#!/usr/bin/env python
# coding: utf-8
# In[28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from collections import Counter
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import os
import re
def atoi(text):
    """Convert a run of decimal digits to ``int``; return other text as-is.

    ``str.isdecimal`` is used instead of ``str.isdigit`` because the latter
    also accepts characters such as superscripts ("²") that ``int`` cannot
    parse, which would raise ``ValueError``.

    Args:
        text: A string chunk.

    Returns:
        ``int(text)`` when ``text`` consists only of decimal digits,
        otherwise ``text`` itself.
    """
    return int(text) if text.isdecimal() else text
def natural_keys(text):
    """Sort key that orders embedded numbers numerically ("2.png" < "10.png").

    The string is split into alternating non-digit and digit chunks (the
    capturing group keeps the digit runs in the result) and every chunk is
    passed through ``atoi``.

    Args:
        text: The string to build a key for, e.g. a file name.

    Returns:
        A list of chunks, with digit runs converted by ``atoi``.
    """
    return [atoi(c) for c in re.split(r"(\d+)", text)]
# In[29]:
# File names of the binarised training images, in natural (numeric) order.
train_data_dir = sorted(os.listdir("./bwtrain"), key=natural_keys)
x_train = train_data_dir
# In[30]:
# Training labels, each coerced to str.
y_train = [str(label) for label in pd.read_csv("train.csv")["label"].tolist()]

# Alphabet of the recogniser: lowercase letters plus space and full stop.
characters = set("abcdefghijklmnopqrstuvwxyz")
characters.update((" ", "."))
# In[31]:
# File names of the binarised validation images, in natural (numeric) order.
val_data_dir = sorted(os.listdir("./bwval"), key=natural_keys)
x_val = val_data_dir
# In[32]:
# Validation labels, each coerced to str.
y_val = [str(label) for label in pd.read_csv("val.csv")["label"].tolist()]
# In[33]:
# Batch size used by the training, validation and test pipelines.
# NOTE(review): presumably 1 because labels are not padded to a common
# length before batching - confirm before raising it.
batch_size = 1

# Size every image is resized to before it enters the network.
img_width = 256
img_height = 256

# Total down-sampling applied by the two 2x2 max-pooling layers.
downsample_factor = 4

# Longest training label; decoded predictions are truncated to this length.
max_length = max(map(len, y_train))
# In[34]:
# Replace the listed file names by index-based paths (0.png, 1.png, ...),
# in place, so that entry i lines up with label i.
x_train[:] = [f"./bwtrain/{idx}.png" for idx in range(len(x_train))]
x_val[:] = [f"./bwval/{idx}.png" for idx in range(len(x_val))]

# tf.data slices NumPy arrays, so convert all four lists.
x_train = np.array(x_train)
y_train = np.array(y_train)
x_val = np.array(x_val)
y_val = np.array(y_val)
# In[35]:
# Mapping characters to integers.
# The vocabulary is sorted because iterating a set of strings yields a
# different order in every Python process (string hash randomisation).  With
# an unsorted vocabulary the index assigned to each character would differ
# between the run that trained a saved model and a later run that decodes
# its predictions.  A model trained with the old, unordered mapping has to
# be retrained (or decoded with the exact mapping it was trained with).
char_to_num = layers.experimental.preprocessing.StringLookup(
    vocabulary=sorted(characters), num_oov_indices=0, mask_token=None
)
# Mapping integers back to original characters
num_to_char = layers.experimental.preprocessing.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True
)
# In[36]:
def encode_single_sample(img_path, label):
    """Load one image/label pair and convert it into model inputs.

    Args:
        img_path: Path of a PNG file (string tensor).
        label: The text shown in the image (string tensor).

    Returns:
        A dict with the keys ``"image"`` (float32 tensor, transposed so the
        first axis runs along the image width) and ``"label"`` (the label's
        characters mapped to integer ids by ``char_to_num``).
    """
    # 1. Read image
    img = tf.io.read_file(img_path)
    # 2. Decode as a single-channel (grayscale) image.  channels=1 forces one
    #    channel whatever the file contains, which is what the model's
    #    (width, height, 1) input requires; channels=0 would keep the
    #    file's own channel count.
    img = tf.io.decode_png(img, channels=1)
    # 3. Convert to float32 in [0, 1] range
    img = tf.image.convert_image_dtype(img, tf.float32)
    # 4. Resize to the desired size
    img = tf.image.resize(img, [img_height, img_width])
    # 5. Transpose the image because we want the time
    #    dimension to correspond to the width of the image.
    img = tf.transpose(img, perm=[1, 0, 2])
    # 6. Map the characters in label to numbers
    label = char_to_num(tf.strings.unicode_split(label, input_encoding="UTF-8"))
    # 7. Return a dict as our model is expecting two inputs
    return {"image": img, "label": label}
# In[37]:
# Let tf.data choose the parallelism and the prefetch depth.
autotune = tf.data.experimental.AUTOTUNE

# Training pipeline: (path, label) pairs -> encoded samples -> batches.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.map(encode_single_sample, num_parallel_calls=autotune)
train_dataset = train_dataset.batch(batch_size).prefetch(buffer_size=autotune)

# Validation pipeline, built the same way.
validation_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))
validation_dataset = validation_dataset.map(
    encode_single_sample, num_parallel_calls=autotune
)
validation_dataset = validation_dataset.batch(batch_size).prefetch(
    buffer_size=autotune
)
#print(list(train_dataset.as_numpy_iterator()))
# In[38]:
class CTCLayer(layers.Layer):
    """Pass-through layer that attaches the CTC loss to the model.

    The layer receives the true labels and the per-time-step predictions,
    registers the CTC loss through ``add_loss`` and returns the predictions
    unchanged.
    """

    def __init__(self, name=None, **kwargs):
        # The constructor must be named __init__; as a plain method called
        # "init" it never ran and ``loss_fn`` was left undefined.  **kwargs
        # forwards the standard Layer arguments (e.g. ``trainable``,
        # ``dtype``) that Keras passes when rebuilding the layer from its
        # config.
        super().__init__(name=name, **kwargs)
        self.loss_fn = keras.backend.ctc_batch_cost

    def call(self, y_true, y_pred):
        """Add the CTC loss for this batch and return ``y_pred`` unchanged.

        Args:
            y_true: Label ids, one row per sample.
            y_pred: Predictions; axis 1 is used as the sequence length.

        Returns:
            ``y_pred``, untouched.
        """
        # Compute the training-time loss value and add it
        # to the layer using `self.add_loss()`.
        batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
        input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
        label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")
        # Every sample in the batch is given the full prediction length and
        # the full (batch-wide) label length.
        input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
        label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")
        loss = self.loss_fn(y_true, y_pred, input_length, label_length)
        self.add_loss(loss)
        # At test time, just return the computed predictions
        return y_pred
# In[39]:
def build_model():
    """Assemble and compile the CNN + bidirectional-LSTM model with CTC loss.

    Returns:
        A compiled Keras model named "ocr_model_v1" with the inputs "image"
        and "label"; the CTC loss is added by the final "ctc_loss" layer.
    """
    # The two model inputs: the transposed image and its label ids.
    input_img = layers.Input(
        shape=(img_width, img_height, 1), name="image", dtype="float32"
    )
    labels = layers.Input(name="label", shape=(None,), dtype="float32")

    # Two identical convolution blocks with 32 and then 64 filters, each
    # followed by 2x2 max pooling.
    x = input_img
    for block_no, n_filters in enumerate((32, 64), start=1):
        conv = layers.Conv2D(
            n_filters,
            (3, 3),
            activation="relu",
            kernel_initializer="he_normal",
            padding="same",
            name=f"Conv{block_no}",
        )
        pool = layers.MaxPooling2D((2, 2), name=f"pool{block_no}")
        x = pool(conv(x))

    # Two 2x2 poolings shrink both spatial axes by 4 and the last conv layer
    # has 64 filters, so fold height and channels into one feature axis
    # before the recurrent part.
    new_shape = ((img_width // 4), (img_height // 4) * 64)
    x = layers.Reshape(target_shape=new_shape, name="reshape")(x)
    x = layers.Dense(64, activation="relu", name="dense1")(x)
    x = layers.Dropout(0.2)(x)

    # Recurrent part: two stacked bidirectional LSTMs.
    for units in (128, 64):
        recurrent = layers.LSTM(units, return_sequences=True, dropout=0.25)
        x = layers.Bidirectional(recurrent)(x)

    # One output class per character plus one extra class.
    x = layers.Dense(len(characters) + 1, activation="softmax", name="dense2")(x)

    # The CTC layer adds the loss and passes the predictions through.
    output = CTCLayer(name="ctc_loss")(labels, x)

    model = keras.models.Model(
        inputs=[input_img, labels], outputs=output, name="ocr_model_v1"
    )
    # No loss argument: the loss comes from the CTC layer.
    model.compile(optimizer=keras.optimizers.Adam())
    return model
# In[ ]:
# Get the model: continue from the previously saved model instead of
# building a fresh one.
# model = build_model()
model = keras.models.load_model("./OCRM")
model.summary()

# Training
epochs = 100
early_stopping_patience = 10

# Add early stopping: stop when val_loss has not improved for
# `early_stopping_patience` epochs and roll back to the best weights.
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=early_stopping_patience, restore_best_weights=True
)

# Keep only the best checkpoint.  The monitored quantity is a loss, so
# "best" means lowest: mode must be "min".  With mode="max" the checkpoint
# with the highest validation loss would be the one kept.
checkpoint_filepath = "./checkpoint"
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor="val_loss",
    mode="min",
    save_best_only=True,
)

# Train the model
history = model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=epochs,
    callbacks=[early_stopping, model_checkpoint_callback],
)
# In[ ]:
# Inference model: runs from the "image" input to the output of "dense2",
# leaving out the label input and the CTC loss layer.
image_input = model.get_layer(name="image").input
dense2_output = model.get_layer(name="dense2").output
prediction_model = keras.models.Model(image_input, dense2_output)
prediction_model.summary()
# In[ ]:
# A utility function to decode the output of the network
def decode_batch_predictions(pred):
    """Turn a batch of network outputs into a list of predicted strings.

    Args:
        pred: Output of the prediction model; axis 0 is the batch and axis 1
            the sequence length passed to the CTC decoder.

    Returns:
        A list with one decoded string per batch entry.
    """
    # Every sample uses the full output sequence length.
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    # Use greedy search. For complex tasks, you can use beam search
    decoded = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
    decoded = decoded[:, :max_length]
    # Iterate over the results and get back the text
    output_text = []
    for res in decoded:
        # ctc_decode pads rows that are shorter than the longest one in the
        # batch with -1; drop that padding before the id -> character lookup
        # so it cannot end up in the text.
        res = tf.boolean_mask(res, tf.not_equal(res, -1))
        text = tf.strings.reduce_join(num_to_char(res)).numpy().decode("utf-8")
        output_text.append(text)
    return output_text
# In[ ]:
# Predict the test set and store the decoded texts in predictions.txt.
test_data_dir = os.listdir("./bwtest")
test_data_dir.sort(key=natural_keys)
x_test = test_data_dir
# Index-based paths (0.png, 1.png, ...), one per file found in the directory.
for i in range(len(x_test)):
    x_test[i] = f"./bwtest/{i}.png"
x_test = np.array(x_test)
# encode_single_sample needs a label for every image, but the labels are
# never used for prediction.  Training labels serve as placeholders;
# np.resize repeats or truncates them to exactly one per test image instead
# of assuming there are 10000 of them.
y_test = np.resize(y_train, len(x_test))

test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))
test_dataset = (
    test_dataset.map(
        encode_single_sample, num_parallel_calls=tf.data.experimental.AUTOTUNE
    )
    .batch(batch_size)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

my_list = []
for batch in test_dataset:
    batch_images = batch["image"]
    preds = prediction_model.predict(batch_images)
    pred_texts = decode_batch_predictions(preds)
    my_list.append(pred_texts)
    print(pred_texts)

# Write all results once, after the loop.  The file is overwritten ("w")
# rather than appended to, so re-running the cell does not duplicate
# predictions.  Each line is the printed form of one batch's list of texts.
with open("predictions.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{item}\n" for item in my_list)
# In[ ]:
#m=os.listdir("./bwtrain_copy")
#m.sort(key=natural_keys)
# In[ ]:
# Show the predictions for the first validation batch on a 4x4 grid.
for batch in validation_dataset.take(1):
    batch_images = batch["image"]
    batch_labels = batch["label"]

    preds = prediction_model.predict(batch_images)
    pred_texts = decode_batch_predictions(preds)

    # Ground-truth texts, decoded from the label ids.
    orig_texts = [
        tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8")
        for label in batch_labels
    ]

    _, ax = plt.subplots(4, 4, figsize=(15, 5))
    for i, text in enumerate(pred_texts):
        # Back to 8-bit pixels, then undo the transpose applied in
        # encode_single_sample.
        img = (batch_images[i, :, :, 0] * 255).numpy().astype(np.uint8).T
        panel = ax[i // 4, i % 4]
        panel.imshow(img, cmap="gray")
        panel.set_title(f"Prediction: {text}")
        panel.axis("off")
plt.show()