The Challenge¶
Maintaining a healthy diet is difficult. As the saying goes, the best way to escape a problem is to solve it. So why not leverage the power of deep learning and computer vision to build the foundation of a semi-automated food tracking application?
With over 9,300 hand-annotated images spanning 61 classes, the challenge is to train accurate models that can look at an image of food and detect the food items present in it.
It's time to unleash the food (data) scientist in you! Given any image, identify the food item present in it.
Downloads and Installs¶
!wget -q https://s3.eu-central-1.wasabisys.com/aicrowd-practice-challenges/public/foodc/v0.1/train_images.zip
!wget -q https://s3.eu-central-1.wasabisys.com/aicrowd-practice-challenges/public/foodc/v0.1/test_images.zip
!wget -q https://s3.eu-central-1.wasabisys.com/aicrowd-practice-challenges/public/foodc/v0.1/train.csv
!wget -q https://s3.eu-central-1.wasabisys.com/aicrowd-practice-challenges/public/foodc/v0.1/test.csv
!mkdir -p data/train data/test models
!unzip -q train_images -d data/train
!unzip -q test_images -d data/test
Imports¶
import sys
import os
import gc
import warnings
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import torch.nn.functional as F
from fastai.script import *
from fastai.vision import *
from fastai.callbacks import *
from fastai.distributed import *
from fastprogress import fastprogress
from torchvision.models import *
np.random.seed(23)        # reproducible numpy randomness (e.g. the validation split)
torch.cuda.set_device(0)  # note: torch.cuda.device(0) on its own is a no-op context manager
warnings.filterwarnings("ignore")
torch.multiprocessing.freeze_support()
print("[INFO] GPU:", torch.cuda.get_device_name())
DataBunch and Model¶
Here we use a technique called progressive resizing: the model is first trained on small images, and at each subsequent step it is initialized with the weights learned at the previous, smaller image size.
def get_data(size, batch_size):
    """
    Returns a DataBunch for the Learner at the given image size and batch size.
    """
    train = pd.read_csv("train.csv")
    # 90/10 random train/validation split, labels taken from the dataframe
    src = (ImageList.from_df(train, path="data/", folder="train/train_images/")
           .split_by_rand_pct(0.1)
           .label_from_df())
    src.add_test_folder("test/test_images/")
    # Standard fastai augmentations: flips, small rotations, zoom, lighting, warp
    tfms = get_transforms(do_flip=True, flip_vert=False, max_rotate=10.0,
                          max_zoom=1.1, max_lighting=0.2, max_warp=0.2,
                          p_affine=0.75, p_lighting=0.75)
    data = (src.transform(tfms, size=size, resize_method=ResizeMethod.SQUISH)
            .databunch(bs=batch_size)
            .normalize(imagenet_stats))
    assert sorted(set(train.ClassName.unique())) == sorted(data.classes), "Class Mismatch"
    print("[INFO] Number of Classes: ", data.c)
    data.num_workers = 4
    return data
sample_data = get_data(32, (2048//32))
sample_data.show_batch(3, 3)
As you can see, the transforms have been applied and the images are normalized as well!
We first initialize and save the model at each image size, so that each progressive-resizing step has a checkpoint to load from.
learn = create_cnn(get_data(32, (2048//32)), models.densenet161,
                   metrics=[accuracy, FBeta(beta=1, average='macro')])
learn.model_dir = "models/"
learn.save("densenet_32")

learn = create_cnn(get_data(64, (2048//64)), models.densenet161,
                   metrics=[accuracy, FBeta(beta=1, average='macro')]).load("densenet_32")
learn.model_dir = "models/"
learn.save("densenet_64")

learn = create_cnn(get_data(128, (2048//128)), models.densenet161,
                   metrics=[accuracy, FBeta(beta=1, average='macro')]).load("densenet_64")
learn.model_dir = "models/"
learn.save("densenet_128")

learn = create_cnn(get_data(256, (2048//256)), models.densenet161,
                   metrics=[accuracy, FBeta(beta=1, average='macro')]).load("densenet_128")
learn.model_dir = "models/"
learn.save("densenet_256")
def train_model(size, iter1, iter2, mixup=False):
    """
    Trains the model at the given image size: iter1 epochs with the backbone
    frozen, then iter2 epochs with all layers unfrozen.
    """
    size_match = {"256": "128", "128": "64", "64": "32"}
    learn = create_cnn(get_data(size, (2048//size)), models.densenet161,
                       metrics=[accuracy, FBeta(beta=1, average='macro')])
    learn.model_dir = "models/"
    if mixup:
        learn.mixup()
    # Load the weights trained at the previous (smaller) image size
    if size != 32:
        learn.load("densenet_" + size_match[str(size)])
    name = "densenet_" + str(size)
    print("[INFO] Training for : ", name)
    learn.fit_one_cycle(iter1, 1e-4, callbacks=[ShowGraph(learn),
        SaveModelCallback(learn, monitor='f_beta', mode='max', name=name)])
    learn.unfreeze()
    learn.fit_one_cycle(iter2, 5e-5, callbacks=[ShowGraph(learn),
        SaveModelCallback(learn, monitor='f_beta', mode='max', name=name)])
Here you might notice the use of a function called mixup. mixup is a callback in fastai that is extremely effective at regularizing models in computer vision. Instead of feeding the model the raw images, we take two images (not necessarily from the same class) and make a linear combination of them. In terms of tensors:
new_image = t * image1 + (1-t) * image2
where t is a float between 0 and 1. The target we assign to that new image is the same combination of the original targets:
new_target = t * target1 + (1-t) * target2
assuming the targets are one-hot encoded (which usually isn't the case in PyTorch). And it's as simple as that.
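To make this concrete, here is a minimal standalone PyTorch sketch of the idea (an illustration we added, not the actual fastai callback, which also samples t from a Beta distribution):
import torch

def mixup_pair(image1, image2, target1, target2, t=0.7):
    # Blend two images and their one-hot targets with the same weight t
    new_image = t * image1 + (1 - t) * image2
    new_target = t * target1 + (1 - t) * target2
    return new_image, new_target

# Two dummy 3x224x224 "images" and one-hot targets for a 61-class problem
img1, img2 = torch.rand(3, 224, 224), torch.rand(3, 224, 224)
tgt1, tgt2 = torch.zeros(61), torch.zeros(61)
tgt1[5], tgt2[17] = 1.0, 1.0
mixed_img, mixed_tgt = mixup_pair(img1, img2, tgt1, tgt2, t=0.7)
print(mixed_tgt[5].item(), mixed_tgt[17].item())  # ~0.7 and ~0.3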
For example:
[Image: a mixup blend of a dog photo and a cat photo (source linked in the original). Dog or cat? The right answer here is 70% dog and 30% cat!]
train_model(32, 5, 3)
train_model(64, 5, 4)
train_model(128, 7, 4, mixup=True)
train_model(256, 7, 5, mixup=True)
learn = create_cnn(get_data(300, (2048//300)), models.densenet161,
                   metrics=[accuracy, FBeta(beta=1, average='macro')]).load("densenet_256")
learn.model_dir = "models/"
learn.mixup()
size = 300
name = "densenet_" + str(size)
print("[INFO] Training for : ", name)
learn.fit_one_cycle(5, 1e-4, callbacks=[ShowGraph(learn),
    SaveModelCallback(learn, monitor='f_beta', mode='max', name=name)])
learn.load("densenet_300")
interp = ClassificationInterpretation.from_learner(learn)
losses, idxs = interp.top_losses()
interp.plot_top_losses(9, figsize=(15,11))
interp.plot_confusion_matrix(figsize=(12,12), dpi=100)
print("[INFO] MOST CONFUSED:")
interp.most_confused(min_val=5)
The model gets confused between some very similar categories, such as coffee-with-caffeine and espresso-with-caffeine. Making the model more robust here calls for appropriate augmentations.
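One option (an untuned assumption on our part, not something validated on this dataset) is to pass extra transforms such as cutout to get_transforms, occluding random patches so the model cannot rely on a single local cue:
# Illustrative only: the hole counts and sizes are not tuned for this dataset
tfms = get_transforms(do_flip=True, flip_vert=False, max_rotate=10.0,
                      max_zoom=1.1, max_lighting=0.2, max_warp=0.2,
                      p_affine=0.75, p_lighting=0.75,
                      xtra_tfms=[cutout(n_holes=(1, 4), length=(10, 80), p=0.5)])
This tfms could then be used inside get_data in place of the original call.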
def make_submission(learn, name):
    images = []
    prediction = []
    probability = []
    test_path = "data/test/test_images/"
    test = pd.read_csv("test.csv")
    files = test.ImageId
    for i in files:
        images.append(i)
        img = open_image(os.path.join(test_path, i))
        # learn.predict returns (predicted Category, class index, class probabilities)
        pred_class, pred_idx, outputs = learn.predict(img)
        prediction.append(pred_class.obj)
        # outputs are softmax probabilities (already non-negative), so abs() is not needed
        probability.append(outputs.max().item())
    answer = pd.DataFrame({'ImageId': images, 'ClassName': prediction, 'probability': probability})
    display(answer.head())
    answer[["ImageId", "ClassName"]].to_csv(name, index=False)
make_submission(learn, name="submission_size300.csv")
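Calling learn.predict one image at a time is slow. Since get_data already added the test folder to the DataBunch, batched inference with get_preds should give the same predictions much faster; a sketch, assuming the file order from learn.data.test_ds matches what the submission expects ("submission_batched.csv" is an illustrative name):
# Hypothetical batched alternative to the per-image loop above
preds, _ = learn.get_preds(ds_type=DatasetType.Test)  # softmax outputs, one row per test image
labels = [learn.data.classes[i] for i in preds.argmax(dim=1)]
files = [p.name for p in learn.data.test_ds.items]    # file names in DataBunch order
pd.DataFrame({"ImageId": files, "ClassName": labels}).to_csv("submission_batched.csv", index=False)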
Improving Further¶
- Appropriate augmentations
- Different models, like densenet201 or resnet50
- Mixed Precision training (i.e. to_fp16() in fastai; see the sketch below)
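For the last point, a minimal sketch of mixed-precision training in fastai v1 (assuming a GPU with FP16 support; the epochs and learning rate are illustrative, not tuned):
# to_fp16() wraps the Learner so forward/backward passes run in half precision,
# with loss scaling handled by fastai
learn = create_cnn(get_data(256, (2048//256)), models.densenet161,
                   metrics=[accuracy, FBeta(beta=1, average='macro')])
learn.model_dir = "models/"
learn.load("densenet_256")
learn = learn.to_fp16()        # switch to half precision after loading the FP32 weights
learn.fit_one_cycle(3, 1e-4)   # epochs/LR illustrative, not tuned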