Intel® DevCloud
Help for those needing help starting or connecting to the Intel® DevCloud
1514 Discussions

Getting the error below. Please help

ffa
Novice
752 Views

flower_detection_job.sh.e679750

Failed to detect engines! (No such file or directory)
(Kernel 4.16 or newer is required for i915 PMU support.)
Traceback (most recent call last):
File "/home/u177703/UFP/train.py", line 2, in <module>
import numpy as np
ImportError: No module named numpy
/usr/sbin/kill-illegit-procs: line 86: kill: (756605) - No such process
/usr/sbin/kill-illegit-procs: line 86: kill: (756610) - No such process
/usr/sbin/kill-illegit-procs: line 86: kill: (756629) - No such process
kill: usage: kill [-s sigspec | -n signum | -sigspec] pid | jobspec ... or kill -l [sigspec]

 

The errors I need help with are the one on the first line ("Failed to detect engines!") and the numpy ImportError.

 

I used the following command to submit the job.

qsub -l nodes=1:idc084 flower_detection_job.sh

 

flower_detection_job.sh

# Job script submitted with qsub: runs train.py on the dataset under $HOME/UFP.
HOME_DIR=$HOME
# The job log shows "ImportError: No module named numpy" (module name unquoted),
# which is the Python 2 wording, so the bare `python` on the node is Python 2.
# Call python3 explicitly. Assumes numpy/torch/torchvision are installed for
# python3 on the node - TODO confirm (otherwise `pip install --user` them first).
# Expansions are quoted so a home path containing spaces is passed intact.
python3 "${HOME_DIR}/UFP/train.py" "${HOME_DIR}/UFP" --arch=resnet18 --gpu --epochs=5

 

train.py

import argparse
import os
import sys
import warnings

import numpy as np
import torch
from PIL import Image
from torch import nn, optim
from torchvision import datasets, models, transforms
warnings.filterwarnings("ignore")

# Command-line interface of the training script. The parsed values end up in
# `args_in`, which the rest of the script reads.
parser = argparse.ArgumentParser(
    description="Train a new neural network on a dataset."
)

parser.add_argument(
    "data_dir", type=str, help="Dataset for the network to train on."
)

# The help text lists the architectures the model-building code in this script
# actually accepts (it previously advertised vgg13, which is rejected).
parser.add_argument(
    "--arch",
    type=str,
    default="resnet18",
    help="Available architectures: resnet18, densenet161, alexnet",
)

parser.add_argument(
    "--epochs", type=int, default=10, help="Number of epochs."
)

parser.add_argument(
    "--gpu", action="store_true", help="Train on a GPU device."
)

parser.add_argument(
    "--hidden_units", type=int, default=256, help="Number of hidden units."
)

parser.add_argument(
    "--learning_rate",
    type=float,
    default=0.003,
    help="Learning rate to use for the model.",
)

parser.add_argument(
    "--save_dir",
    type=str,
    default="./",
    help="Location to save your model after training.",
)

args_in = parser.parse_args()


# if args_in.gpu:
# try:
# assert torch.cuda.is_available() == True
# device = "cuda"
# print("Using CUDA..")
# except AssertionError:
# answer = input("GPU is not available on this device, use CPU? (yes, no): ")

# if answer.lower() == "yes":
# device = "cpu"
# print("Using CPU..")
# elif answer.lower() == "no":
# print("Terminating..")
# sys.exit()
# else:
# print("Invalid option selected, terminating..")
# sys.exit()
# else:
# device = "cpu"
# print("Using CPU..")

# print("Loading data..")

# Honour the --gpu flag: CUDA is used only when it was requested AND is
# available. When it was requested but is missing, fall back to the CPU with a
# notice instead of prompting, because a batch job has no interactive stdin.
if args_in.gpu and torch.cuda.is_available():
    device = "cuda"
else:
    if args_in.gpu:
        print("GPU is not available on this device, using CPU..")
    device = "cpu"

# Locations of the three dataset splits below the directory given on the
# command line. os.path.join copes with a trailing "/" and with an empty
# string, where indexing args_in.data_dir[-1] raised IndexError.
data_dir = args_in.data_dir
train_dir = os.path.join(data_dir, 'train')
valid_dir = os.path.join(data_dir, 'valid')
test_dir = os.path.join(data_dir, 'test')

# Per-channel mean / standard deviation normalisation shared by both pipelines.
_normalize = transforms.Normalize(
    [0.485, 0.456, 0.406],
    [0.229, 0.224, 0.225],
)

# Training pipeline: random crop and rotation before conversion to a tensor.
train_transforms = transforms.Compose(
    [
        transforms.RandomResizedCrop(224),
        transforms.RandomRotation(30),
        transforms.ToTensor(),
        _normalize,
    ]
)

# Validation / test pipeline: fixed resize followed by a centre crop.
val_test_transforms = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        _normalize,
    ]
)

# One ImageFolder dataset per split; only the training split uses the
# randomised transform pipeline.
train_dataset = datasets.ImageFolder(train_dir, transform=train_transforms)
val_dataset = datasets.ImageFolder(valid_dir, transform=val_test_transforms)
test_dataset = datasets.ImageFolder(test_dir, transform=val_test_transforms)

# All three loaders use the same batch size and shuffle their dataset.
_loader_options = {"batch_size": 64, "shuffle": True}
trainloader = torch.utils.data.DataLoader(train_dataset, **_loader_options)
valloader = torch.utils.data.DataLoader(val_dataset, **_loader_options)
testloader = torch.utils.data.DataLoader(test_dataset, **_loader_options)

print("Building model..")

# Supported architectures. Each entry holds: the torchvision constructor, the
# name of the model attribute that holds the final classification head, and
# the number of input features that head receives.
_ARCHITECTURES = {
    "resnet18": (models.resnet18, "fc", 512),
    "densenet161": (models.densenet161, "classifier", 2208),
    "alexnet": (models.alexnet, "classifier", 9216),
}

if args_in.arch not in _ARCHITECTURES:
    print("Architecture is not available!")
    # Exit with a non-zero status so the job scheduler / calling shell sees
    # the failure; a bare sys.exit() would report success.
    sys.exit(1)

_build_model, _head_attr, _in_features = _ARCHITECTURES[args_in.arch]
model = _build_model(pretrained=True)

# Freeze every pretrained parameter; only the freshly created head below
# (whose parameters are handed to the optimizer) is trained.
for params in model.parameters():
    params.requires_grad = False

# Replacement head: one hidden layer, dropout, and log-probabilities over the
# 102 output classes (log-probabilities are what nn.NLLLoss expects).
classifier = nn.Sequential(
    nn.Linear(_in_features, args_in.hidden_units),
    nn.ReLU(),
    nn.Dropout(p=0.25),
    nn.Linear(args_in.hidden_units, 102),
    nn.LogSoftmax(dim=1),
)

setattr(model, _head_attr, classifier)
optimizer = optim.Adam(classifier.parameters(), lr=args_in.learning_rate)

# Training state. The loss is NLLLoss because the classifier head ends in
# LogSoftmax and therefore emits log-probabilities.
criterion = nn.NLLLoss()
epochs = args_in.epochs
steps = 0
train_losses, test_losses = [], []
running_loss = 0
print_every = 30  # run a validation pass every `print_every` training steps

model.to(device)

print("Training model..")

for e in range(epochs):
    for images, labels in trainloader:
        steps += 1

        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()

        logps = model(images)
        loss = criterion(logps, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        if steps % print_every != 0:
            continue

        # Periodic validation pass. Separate variable names are used so the
        # training batch / loss above are not overwritten.
        test_loss = 0
        correct = 0
        seen = 0
        model.eval()

        with torch.no_grad():
            for val_images, val_labels in valloader:
                val_images = val_images.to(device)
                val_labels = val_labels.to(device)

                val_logps = model(val_images)
                test_loss += criterion(val_logps, val_labels).item()

                # The most probable class is the argmax of the log-probabilities.
                predictions = val_logps.argmax(dim=1)
                correct += (predictions == val_labels).sum().item()
                seen += val_labels.size(0)

        model.train()

        # Accuracy is counted per sample rather than averaged over per-batch
        # means, so a smaller final batch does not skew the figure.
        accuracy = correct / seen if seen else 0.0

        train_losses.append(running_loss/print_every)
        test_losses.append(test_loss/len(valloader))

        print("Epochs: {}/ {}..".format(e+1, epochs),
              "Train loss: {:.3f}..".format(running_loss/print_every),
              "Test loss: {:.3f}..".format(test_loss/len(valloader)),
              "Accuracy: {:.3f}..".format(accuracy))

        running_loss = 0

print("Model trained")

print("Testing data..")

# Evaluate on the held-out test split with gradients disabled and the model in
# evaluation mode.
model.eval()
correct = 0
seen = 0

with torch.no_grad():
    for images, labels in testloader:
        images, labels = images.to(device), labels.to(device)

        # The most probable class is the argmax of the log-probabilities; the
        # loss is not needed here, so it is no longer computed.
        predictions = model(images).argmax(dim=1)
        correct += (predictions == labels).sum().item()
        seen += labels.size(0)

# Per-sample accuracy: averaging per-batch means would over-weight a smaller
# final batch.
accuracy = correct / seen if seen else 0.0

print("Accuracy on test data: {}".format(accuracy))

print("Saving model..")

# Bundle what is needed to rebuild the model later: the class-to-index mapping
# of the training set, the architecture name and hidden size, and the model /
# optimizer state.
model.class_to_idx = train_dataset.class_to_idx
checkpoint = {
    'epochs': epochs,
    'label_mapping': model.class_to_idx,
    'model_arch': args_in.arch,
    'hidden_units': args_in.hidden_units,
    'model_state_dict': model.state_dict(),
    'optim_state_dict': optimizer.state_dict()
}

# Build the path from --save_dir itself. The earlier code decided whether to
# trim save_dir by inspecting data_dir, so a data_dir ending in "/" chopped the
# last character off the save directory. os.path.join handles a trailing "/".
save_dir = args_in.save_dir
checkpoint_path = os.path.join(save_dir, 'checkpoint-' + args_in.arch + '.pth')
torch.save(checkpoint, checkpoint_path)

print("Model saved successfully.")

 

0 Kudos
1 Solution
JesusE_Intel
Moderator
687 Views

Hi ffa,


To reset the Jupyter environment on Intel Developer for the Edge run the following commands in your home directory. Once complete, log out by clicking File -> Log Out. The next time you log in, the reference samples will be generated again. Please ensure you back up any files/changes under reference samples as this is not reversible.


rm -r Reference-samples/

rm ~/.jupyter/.staging_complete


Regards,

Jesus


View solution in original post

0 Kudos
6 Replies
RemyaP_Intel
Moderator
723 Views

Hi,


Thank you for posting in Intel Communities.


Could you please tell us which DevCloud you are using? Is it DevCloud for OneAPI/Edge/FPGA?


Regards,

Remya Premdas


0 Kudos
RemyaP_Intel
Moderator
698 Views

Hi,


Could you please check again whether that is the OneAPI DevCloud? Judging by the screenshot and the node to which you have connected, it doesn't seem to be the OneAPI DevCloud. Please confirm.


Regards,

Remya Premdas


0 Kudos
ffa
Novice
695 Views
Yeah, sorry, it is Edge. I want to reset both of them, Edge as well as OneAPI. I have backed up my data.
0 Kudos
JesusE_Intel
Moderator
688 Views

Hi ffa,


To reset the Jupyter environment on Intel Developer for the Edge run the following commands in your home directory. Once complete, log out by clicking File -> Log Out. The next time you log in, the reference samples will be generated again. Please ensure you back up any files/changes under reference samples as this is not reversible.


rm -r Reference-samples/

rm ~/.jupyter/.staging_complete


Regards,

Jesus


0 Kudos
JesusE_Intel
Moderator
659 Views

If you need any additional information, please submit a new question as this thread will no longer be monitored.


0 Kudos
Reply