Solved: Getting the error below. Please help

ffa · ‎11-21-2022

flower_detection_job.sh.e679750

Failed to detect engines! (No such file or directory)
(Kernel 4.16 or newer is required for i915 PMU support.)
Traceback (most recent call last):
File "/home/u177703/UFP/train.py", line 2, in <module>
import numpy as np
ImportError: No module named numpy
/usr/sbin/kill-illegit-procs: line 86: kill: (756605) - No such process
/usr/sbin/kill-illegit-procs: line 86: kill: (756610) - No such process
/usr/sbin/kill-illegit-procs: line 86: kill: (756629) - No such process
kill: usage: kill [-s sigspec | -n signum | -sigspec] pid | jobspec ... or kill -l [sigspec]

The errors are the message on the first line ("Failed to detect engines!") and the ImportError for numpy.

I used the following command to submit the job.

qsub -l nodes=1:idc084 flower_detection_job.sh

flower_detection_job.sh

# Job script: runs train.py on the dataset stored under $HOME/UFP.
HOME_DIR="$HOME"
# Expansions are quoted so a home path containing spaces or glob characters
# is passed to python as a single argument.
# NOTE(review): the reported "ImportError: No module named numpy" uses the
# Python 2 wording, so `python` presumably resolves to an interpreter without
# the required packages on the compute node - confirm which interpreter the
# job should use.
python "${HOME_DIR}/UFP/train.py" "${HOME_DIR}/UFP" --arch=resnet18 --gpu --epochs=5

train.py

import argparse
import os
import sys
import warnings

import numpy as np
import torch
from PIL import Image
from torch import nn, optim
from torchvision import datasets, models, transforms
warnings.filterwarnings("ignore")

# Command-line interface of the training script. The parsed namespace is
# exposed as ``args_in`` and read by the later stages of the script.
parser = argparse.ArgumentParser(description="Train a new neural network on a dataset.")

# Positional: root folder that holds the train/, valid/ and test/ sub-folders.
parser.add_argument("data_dir", type=str, help="Dataset for the network to train on.")

# The help text lists exactly the names the model-building stage accepts;
# any other value makes the script terminate there.
parser.add_argument("--arch", type=str, default="resnet18",
                    help="Available architectures: resnet18, densenet161, alexnet")

parser.add_argument("--epochs", type=int, default=10,
                    help="Number of epochs.")

# Boolean switch: present -> True, absent -> False.
parser.add_argument("--gpu", action="store_true", help="Train on a GPU device.")

# Width of the single hidden layer of the replacement classifier head.
parser.add_argument("--hidden_units", type=int, default=256,
                    help="Number of hidden units.")

parser.add_argument("--learning_rate", type=float, default=0.003,
                    help="Learning rate to use for the model.")

parser.add_argument("--save_dir", type=str, default="./",
                    help="Location to save your model after training.")

args_in = parser.parse_args()


# if args_in.gpu:
# try:
# assert torch.cuda.is_available() == True
# device = "cuda"
# print("Using CUDA..")
# except AssertionError:
# answer = input("GPU is not available on this device, use CPU? (yes, no): ")

# if answer.lower() == "yes":
# device = "cpu"
# print("Using CPU..")
# elif answer.lower() == "no": 
# print("Terminating..")
# sys.exit()
# else:
# print("Invalid option selected, terminating..")
# sys.exit()
# else:
# device = "cpu"
# print("Using CPU..")

# print("Loading data..")

# --- Device selection -------------------------------------------------------
# CUDA is used only when it was requested with --gpu AND is actually
# available; otherwise the script falls back to the CPU without prompting,
# so it stays usable in non-interactive batch jobs.
use_cuda = args_in.gpu and torch.cuda.is_available()
if args_in.gpu and not use_cuda:
    print("GPU is not available on this device, using CPU..")
device = "cuda" if use_cuda else "cpu"

# --- Dataset locations ------------------------------------------------------
# os.path.join copes with a trailing slash (and with an empty string) on the
# user-supplied path, so no manual slicing of the argument is needed.
data_dir = args_in.data_dir
train_dir = os.path.join(data_dir, 'train')
valid_dir = os.path.join(data_dir, 'valid')
test_dir = os.path.join(data_dir, 'test')

# --- Pre-processing ---------------------------------------------------------
# Both pipelines end with the same per-channel normalisation.
# NOTE(review): these look like the statistics torchvision documents for its
# pretrained models - confirm against the torchvision docs.
normalize = transforms.Normalize([0.485, 0.456, 0.406],
                                 [0.229, 0.224, 0.225])

# Training images are randomly cropped and rotated (augmentation).
train_transforms = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomRotation(30),
    transforms.ToTensor(),
    normalize,
])

# Validation/test images get a fixed resize + centre crop.
val_test_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    normalize,
])

# --- Datasets and loaders ---------------------------------------------------
train_dataset = datasets.ImageFolder(train_dir, transform=train_transforms)
val_dataset = datasets.ImageFolder(valid_dir, transform=val_test_transforms)
test_dataset = datasets.ImageFolder(test_dir, transform=val_test_transforms)

batch_size = 64
# Only the training data is shuffled; evaluation order does not need to be
# random, and a fixed order keeps the batch composition identical between runs.
trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
testloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print("Building model..")

# Number of outputs of the replacement classifier head.
NUM_CLASSES = 102

# Supported architectures:
#   name -> (torchvision constructor,
#            attribute holding the final classifier,
#            number of input features of the replacement head)
_ARCHITECTURES = {
    "resnet18": (models.resnet18, "fc", 512),
    "densenet161": (models.densenet161, "classifier", 2208),
    "alexnet": (models.alexnet, "classifier", 9216),
}

if args_in.arch not in _ARCHITECTURES:
    # Passing a message to sys.exit() writes it to stderr and ends the process
    # with a non-zero status, so a batch scheduler does not record the failed
    # run as a success.
    sys.exit("Architecture is not available!")

_constructor, _head_name, _in_features = _ARCHITECTURES[args_in.arch]

model = _constructor(pretrained=True)

# Freeze the pretrained network; only the new head created below is trained.
for params in model.parameters():
    params.requires_grad = False

# Replacement head: one hidden layer, dropout, then log-probabilities
# (LogSoftmax), which is the input nn.NLLLoss expects.
classifier = nn.Sequential(
    nn.Linear(_in_features, args_in.hidden_units),
    nn.ReLU(),
    nn.Dropout(p=0.25),
    nn.Linear(args_in.hidden_units, NUM_CLASSES),
    nn.LogSoftmax(dim=1)
)

# Install the head under the attribute name this architecture uses and hand
# only its parameters to the optimizer.
setattr(model, _head_name, classifier)
optimizer = optim.Adam(classifier.parameters(), lr=args_in.learning_rate)

# --- Training ---------------------------------------------------------------
# Every `print_every` optimisation steps the model is evaluated on the
# validation loader and the averaged training / validation losses are logged.
criterion = nn.NLLLoss()
epochs = args_in.epochs
steps = 0
# test_losses holds the losses measured on the validation loader.
train_losses, test_losses = [], []
running_loss = 0
print_every = 30

model.to(device)

print("Training model..")

for e in range(epochs):
    for images, labels in trainloader:

        steps += 1

        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()

        logps = model(images)
        loss = criterion(logps, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        if steps % print_every != 0:
            continue

        # ---- periodic validation pass ----
        # Loss and accuracy are accumulated per sample and divided by the
        # number of samples seen, so a smaller final batch is not
        # over-weighted the way a mean of per-batch means would be.
        val_loss_sum = 0.0
        correct = 0
        seen = 0
        model.eval()  # evaluation mode (disables dropout in the head)

        with torch.no_grad():
            # Separate names keep the training batch and its loss intact.
            for val_images, val_labels in valloader:
                val_images, val_labels = val_images.to(device), val_labels.to(device)

                val_logps = model(val_images)
                n_batch = val_labels.size(0)
                # criterion returns the batch mean; scale back to a batch sum.
                val_loss_sum += criterion(val_logps, val_labels).item() * n_batch

                # The most likely class is the arg-max of the log-probabilities.
                predictions = val_logps.argmax(dim=1)
                correct += (predictions == val_labels).sum().item()
                seen += n_batch

        model.train()

        # max(..., 1) avoids a ZeroDivisionError if the loader yielded nothing.
        test_loss = val_loss_sum / max(seen, 1)
        accuracy = correct / max(seen, 1)
        train_losses.append(running_loss / print_every)
        test_losses.append(test_loss)

        print("Epochs: {}/ {}..".format(e + 1, epochs),
              "Train loss: {:.3f}..".format(running_loss / print_every),
              "Test loss: {:.3f}..".format(test_loss),
              "Accuracy: {:.3f}..".format(accuracy))

        running_loss = 0
print("Model trained")

# --- Final evaluation on the test loader --------------------------------------
print("Testing data..")

model.eval()  # evaluation mode (disables dropout in the head)
correct = 0
seen = 0

with torch.no_grad():
    for images, labels in testloader:
        images, labels = images.to(device), labels.to(device)

        logps = model(images)

        # The most likely class is the arg-max of the log-probabilities.
        predictions = logps.argmax(dim=1)
        correct += (predictions == labels).sum().item()
        seen += labels.size(0)

# Accuracy is the fraction of correctly classified samples, so a smaller
# final batch is not over-weighted; max(..., 1) guards against an empty loader.
accuracy = correct / max(seen, 1)

print("Accuracy on test data: {}".format(accuracy))

# --- Checkpointing ------------------------------------------------------------
print("Saving model..")

# Keep the class-name -> index mapping of the training set with the model.
model.class_to_idx = train_dataset.class_to_idx
checkpoint = {
    'epochs': epochs,
    'label_mapping': model.class_to_idx,
    'model_arch': args_in.arch,
    'hidden_units': args_in.hidden_units,
    'model_state_dict': model.state_dict(),
    'optim_state_dict': optimizer.state_dict()
}

# The target path is built from --save_dir only. os.path.join handles a
# trailing slash, so the directory name is never truncated.
save_dir = args_in.save_dir or "."
# Create the directory if needed so the save does not fail on a missing folder.
os.makedirs(save_dir, exist_ok=True)
torch.save(checkpoint, os.path.join(save_dir, 'checkpoint-' + args_in.arch + '.pth'))

print("Model saved successfully.")

JesusE_Intel · ‎11-23-2022

Hi ffa,

To reset the Jupyter environment on Intel Developer for the Edge run the following commands in your home directory. Once complete, log out by clicking File -> Log Out. The next time you log in, the reference samples will be generated again. Please ensure you back up any files/changes under reference samples as this is not reversible.

rm -r Reference-samples/

rm ~/.jupyter/.staging_complete

Regards,

Jesus

View solution in original post

RemyaP_Intel · ‎11-21-2022

Hi,

Thank you for posting in Intel Communities.

Could you please tell us which DevCloud you are using? Is it DevCloud for OneAPI/Edge/FPGA?

Regards,

Remya Premdas

ffa · ‎11-22-2022

oneapi

RemyaP_Intel · ‎11-23-2022

Hi,

Could you please check again whether that is the OneAPI DevCloud? Looking at the screenshot and the node to which you have connected, it doesn't seem to be the OneAPI DevCloud. Please confirm.

Regards,

Remya Premdas

ffa · ‎11-23-2022

Yeah, sorry, it is Edge. I want to reset both of them, Edge as well as oneAPI. I have backed up my data.

JesusE_Intel · ‎11-23-2022

Hi ffa,

To reset the Jupyter environment on Intel Developer for the Edge run the following commands in your home directory. Once complete, log out by clicking File -> Log Out. The next time you log in, the reference samples will be generated again. Please ensure you back up any files/changes under reference samples as this is not reversible.

rm -r Reference-samples/

rm ~/.jupyter/.staging_complete

Regards,

Jesus

JesusE_Intel · ‎11-28-2022

If you need any additional information, please submit a new question as this thread will no longer be monitored.