00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017 #ifdef ENABLE_SSH
00018
00019
00021
00022 #include <globalsearch/queueinterfaces/slurm.h>
00023
00024 #include <globalsearch/macros.h>
00025 #include <globalsearch/optimizer.h>
00026 #include <globalsearch/sshconnection.h>
00027 #include <globalsearch/sshmanager.h>
00028 #include <globalsearch/structure.h>
00029
00030 #include <QtCore/QDir>
00031 #include <QtCore/QFile>
00032
00033 namespace GlobalSearch {
00034
00035 SlurmQueueInterface::SlurmQueueInterface(OptBase *parent,
00036 const QString &settingsFile) :
00037 RemoteQueueInterface(parent, settingsFile),
00038 m_squeue("squeue"),
00039 m_sbatch("sbatch"),
00040 m_scancel("scancel"),
00041 m_interval(1),
00042 m_cleanRemoteOnStop(false)
00043 {
00044 m_idString = "SLURM";
00045 m_templates.append("job.slurm");
00046 m_hasDialog = true;
00047
00048 readSettings(settingsFile);
00049 }
00050
00051 SlurmQueueInterface::~SlurmQueueInterface()
00052 {
00053 }
00054
00055 bool SlurmQueueInterface::isReadyToSearch(QString *str)
00056 {
00057
00058 if (m_opt->filePath.isEmpty()) {
00059 *str = tr("Local working directory is not set. Check your Queue "
00060 "configuration.");
00061 return false;
00062 }
00063
00064
00065 QDir workingdir (m_opt->filePath);
00066 bool writable = true;
00067 if (!workingdir.exists()) {
00068 if (!workingdir.mkpath(m_opt->filePath)) {
00069 writable = false;
00070 }
00071 }
00072 else {
00073
00074 QString filename = m_opt->filePath + QString("queuetest-")
00075 + QString::number(RANDUINT());
00076 QFile file (filename);
00077 if (!file.open(QFile::ReadWrite)) {
00078 writable = false;
00079 }
00080 file.remove();
00081 }
00082 if (!writable) {
00083 *str = tr("Cannot write to working directory '%1'.\n\nPlease "
00084 "change the permissions on this directory or specify "
00085 "a different one in the Queue configuration.")
00086 .arg(m_opt->filePath);
00087 return false;
00088 }
00089
00090
00091 if (m_opt->host.isEmpty()) {
00092 *str = tr("Hostname of SLURM server is not set. Check your Queue "
00093 "configuration.");
00094 return false;
00095 }
00096
00097 if (m_scancel.isEmpty()) {
00098 *str = tr("scancel command is not set. Check your Queue "
00099 "configuration.");
00100 return false;
00101 }
00102
00103 if (m_squeue.isEmpty()) {
00104 *str = tr("squeue command is not set. Check your Queue "
00105 "configuration.");
00106 return false;
00107 }
00108
00109 if (m_sbatch.isEmpty()) {
00110 *str = tr("sbatch command is not set. Check your Queue "
00111 "configuration.");
00112 return false;
00113 }
00114
00115 if (m_opt->rempath.isEmpty()) {
00116 *str = tr("Remote working directory is not set. Check your Queue "
00117 "configuration.");
00118 return false;
00119 }
00120
00121 if (m_opt->username.isEmpty()) {
00122 *str = tr("SSH username for SLURM server is not set. Check your Queue "
00123 "configuration.");
00124 return false;
00125 }
00126
00127 if (m_opt->port < 0) {
00128 *str = tr("SSH port is invalid (Port %1). Check your Queue "
00129 "configuration.").arg(m_opt->port);
00130 return false;
00131 }
00132
00133 *str = "";
00134 return true;
00135 }
00136
00137 QDialog* SlurmQueueInterface::dialog()
00138 {
00139 if (!m_dialog) {
00140 m_dialog = new SlurmConfigDialog (m_opt->dialog(),
00141 m_opt,
00142 this);
00143 }
00144 SlurmConfigDialog *d = qobject_cast<SlurmConfigDialog*>(m_dialog);
00145 d->updateGUI();
00146
00147 return d;
00148 }
00149
00150 void SlurmQueueInterface::readSettings(const QString &filename)
00151 {
00152 SETTINGS(filename);
00153
00154 settings->beginGroup(m_opt->getIDString().toLower());
00155 settings->beginGroup("queueinterface/slurmqueueinterface");
00156 int loadedVersion = settings->value("version", 0).toInt();
00157 settings->beginGroup("paths");
00158
00159 m_sbatch = settings->value("sbatch", "sbatch").toString();
00160 m_squeue = settings->value("squeue", "squeue").toString();
00161 m_scancel = settings->value("scancel", "scancel").toString();
00162 this->setInterval(settings->value("interval", 1).toInt());
00163 m_cleanRemoteOnStop = settings->value("cleanRemoteOnStop", false).toBool();
00164
00165 settings->endGroup();
00166 settings->endGroup();
00167 settings->endGroup();
00168
00169 DESTROY_SETTINGS(filename);
00170
00171
00172 switch (loadedVersion) {
00173 case 0:
00174 default:
00175 break;
00176 }
00177
00178 }
00179
00180 void SlurmQueueInterface::writeSettings(const QString &filename)
00181 {
00182 SETTINGS(filename);
00183
00184 const int VERSION = 0;
00185
00186 settings->beginGroup(m_opt->getIDString().toLower());
00187 settings->beginGroup("queueinterface/slurmqueueinterface");
00188 settings->setValue("version", VERSION);
00189 settings->beginGroup("paths");
00190
00191 settings->setValue("sbatch", m_sbatch);
00192 settings->setValue("squeue", m_squeue);
00193 settings->setValue("scancel", m_scancel);
00194 settings->setValue("interval", m_interval);
00195 settings->setValue("cleanRemoteOnStop", m_cleanRemoteOnStop);
00196
00197 settings->endGroup();
00198 settings->endGroup();
00199 settings->endGroup();
00200
00201 DESTROY_SETTINGS(filename);
00202 }
00203
00204 bool SlurmQueueInterface::startJob(Structure *s)
00205 {
00206 SSHConnection *ssh = m_opt->ssh()->getFreeConnection();
00207
00208 if (ssh == NULL) {
00209 m_opt->warning(tr("Cannot connect to ssh server"));
00210 return false;
00211 }
00212
00213 QWriteLocker wlocker (s->lock());
00214
00215 QString command = "cd \"" + s->getRempath() + "\" && " +
00216 m_sbatch + " job.slurm";
00217
00218 QString stdout_str;
00219 QString stderr_str;
00220 int ec;
00221 if (!ssh->execute(command, stdout_str, stderr_str, ec) || ec != 0) {
00222 m_opt->warning(tr("Error executing %1: %2")
00223 .arg(command).arg(stderr_str));
00224 m_opt->ssh()->unlockConnection(ssh);
00225 return false;
00226 }
00227 m_opt->ssh()->unlockConnection(ssh);
00228
00229
00230 QStringList list = stdout_str.split(QRegExp("\\s+"),
00231 QString::SkipEmptyParts);
00232 bool ok = false;
00233 unsigned int jobID;
00234 if (list.size() >= 4) {
00235 jobID = list.at(3).toUInt(&ok);
00236 }
00237
00238 if (!ok) {
00239 m_opt->warning(tr("Error retrieving jobID for structure %1.")
00240 .arg(s->getIDString()));
00241 return false;
00242 }
00243
00244 s->setJobID(jobID);
00245 s->startOptTimer();
00246 return true;
00247 }
00248
00249 bool SlurmQueueInterface::stopJob(Structure *s)
00250 {
00251 SSHConnection *ssh = m_opt->ssh()->getFreeConnection();
00252
00253 if (ssh == NULL) {
00254 m_opt->warning(tr("Cannot connect to ssh server"));
00255 return false;
00256 }
00257
00258
00259 QWriteLocker locker (s->lock());
00260
00261
00262 if (s->getJobID() == 0) {
00263 if (m_cleanRemoteOnStop) {
00264 this->cleanRemoteDirectory(s, ssh);
00265 }
00266 m_opt->ssh()->unlockConnection(ssh);
00267 return true;
00268 }
00269
00270 const QString command = m_scancel + " " + QString::number(s->getJobID());
00271
00272
00273 QString stdout_str;
00274 QString stderr_str;
00275 int ec;
00276 bool ret = true;
00277 if (!ssh->execute(command, stdout_str, stderr_str, ec) || ec != 0) {
00278
00279 ret = false;
00280 }
00281
00282 s->setJobID(0);
00283 s->stopOptTimer();
00284 m_opt->ssh()->unlockConnection(ssh);
00285 return ret;
00286 }
00287
00288 QueueInterface::QueueStatus SlurmQueueInterface::getStatus(Structure *s) const
00289 {
00290
00291 QWriteLocker locker (s->lock());
00292 QStringList queueData = getQueueList();
00293 unsigned int jobID = static_cast<unsigned int>(s->getJobID());
00294
00295
00296
00297 if (queueData.size() == 1 && queueData[0].compare("CommError") == 0) {
00298 return QueueInterface::CommunicationError;
00299 }
00300
00301
00302 if (!jobID && s->getStatus() != Structure::Submitted) {
00303 return QueueInterface::Error;
00304 }
00305
00306
00307 QString status;
00308 QStringList entryList;
00309 unsigned int curJobID = 0;
00310 bool ok;
00311 for (int i = 0; i < queueData.size(); ++i) {
00312 entryList = queueData[i].split(QRegExp("\\s+"),
00313 QString::SkipEmptyParts);
00314
00315 if (entryList.size() > 5) {
00316 curJobID = entryList.first().toUInt(&ok);
00317 if (!ok) {
00318 continue;
00319 }
00320 }
00321 else {
00322 continue;
00323 }
00324 if (curJobID == jobID) {
00325 status = entryList.at(4);
00326 break;
00327 }
00328 }
00329
00330
00331
00332
00333
00334
00335
00336
00337
00338 if (s->getStatus() == Structure::Submitted) {
00339
00340 if (status.isEmpty()) {
00341
00342 bool exists;
00343 if (!m_opt->optimizer()->checkIfOutputFileExists(s, &exists)) {
00344 return QueueInterface::CommunicationError;
00345 }
00346 if (!exists) {
00347
00348 return QueueInterface::Pending;
00349 }
00350 else {
00351
00352 return QueueInterface::Started;
00353 }
00354 }
00355 else {
00356
00357 return QueueInterface::Started;
00358 }
00359 }
00360
00361
00362
00363
00364
00365
00366
00367
00368
00369
00370
00371
00372
00373
00374
00375
00376
00377
00378
00379
00380
00381
00382
00383
00384
00385
00386
00387
00388
00389
00390
00391 if (status.contains(QRegExp("CA|CD|CG|F|NF|R|S|TO"))) {
00392 return QueueInterface::Running;
00393 }
00394 else if (status.contains(QRegExp("CF|PD"))) {
00395 return QueueInterface::Queued;
00396 }
00397 else if (status.isEmpty()) {
00398 locker.unlock();
00399 bool outputFileExists;
00400 if (!m_opt->optimizer()->checkIfOutputFileExists(s, &outputFileExists) ) {
00401 return QueueInterface::CommunicationError;
00402 }
00403 locker.relock();
00404
00405 if (outputFileExists) {
00406
00407 bool success;
00408 if (!m_opt->optimizer()->checkForSuccessfulOutput(s, &success)) {
00409 return QueueInterface::CommunicationError;
00410 }
00411 if (success) {
00412 return QueueInterface::Success;
00413 }
00414 else {
00415 return QueueInterface::Error;
00416 }
00417 }
00418
00419
00420
00421
00422
00423 m_opt->debug(tr("Structure %1 with jobID %2 is missing "
00424 "from the queue and has not written any output.")
00425 .arg(s->getIDString()).arg(s->getJobID()));
00426 return QueueInterface::Error;
00427 }
00428
00429 else {
00430 m_opt->debug(tr("Structure %1 with jobID %2 has "
00431 "unrecognized status: %3")
00432 .arg(s->getIDString()).arg(s->getJobID())
00433 .arg(status));
00434 return QueueInterface::Unknown;
00435 }
00436 }
00437
00438 void SlurmQueueInterface::setInterval(const int sec)
00439 {
00440 m_queueMutex.lockForWrite();
00441 m_interval = sec;
00442 m_queueMutex.unlock();
00443 }
00444
00445 QStringList SlurmQueueInterface::getQueueList() const
00446 {
00447
00448 QReadWriteLock &queueMutex = const_cast<QReadWriteLock&> (m_queueMutex);
00449
00450 queueMutex.lockForRead();
00451
00452
00453 if (m_queueTimeStamp.isValid() &&
00454
00455 #if QT_VERSION >= 0x040700
00456 m_queueTimeStamp.msecsTo(QDateTime::currentDateTime())
00457 <= 1000*m_interval
00458 #else
00459
00460
00461 (m_queueTimeStamp.date() == QDate::currentDate() &&
00462 m_queueTimeStamp.time().msecsTo(QTime::currentTime())
00463 <= 1000*m_interval)
00464 #endif
00465 ) {
00466
00467 QStringList ret (m_queueData);
00468 queueMutex.unlock();
00469 return ret;
00470 }
00471
00472
00473
00474 QDateTime oldTimeStamp (m_queueTimeStamp);
00475 queueMutex.unlock();
00476
00477
00478 QWriteLocker queueLocker (&queueMutex);
00479
00480
00481
00482
00483
00484 if (m_queueTimeStamp.time().msecsTo(oldTimeStamp.time()) != 0) {
00485 queueLocker.unlock();
00486 return this->getQueueList();
00487 }
00488
00489
00490
00491 QStringList &queueData = const_cast<QStringList&> (m_queueData);
00492 QDateTime &queueTimeStamp = const_cast<QDateTime&> (m_queueTimeStamp);
00493
00494
00495 SSHConnection *ssh = m_opt->ssh()->getFreeConnection();
00496
00497 if (ssh == NULL) {
00498 m_opt->warning(tr("Cannot connect to ssh server"));
00499 queueTimeStamp = QDateTime::currentDateTime();
00500 queueData.clear();
00501 queueData << "CommError";
00502 QStringList ret (m_queueData);
00503 return ret;
00504 }
00505
00506 QString command = m_squeue + " | grep " + m_opt->username;
00507
00508
00509 QString stdout_str;
00510 QString stderr_str;
00511 int ec;
00512
00513
00514
00515 if (!ssh->execute(command, stdout_str, stderr_str, ec)
00516 || (ec != 0 && ec != 1 )
00517 ) {
00518 m_opt->ssh()->unlockConnection(ssh);
00519 m_opt->warning(tr("Error executing %1: (%2) %3\n\t"
00520 "Using cached queue data.")
00521 .arg(command)
00522 .arg(QString::number(ec))
00523 .arg(stderr_str));
00524 queueTimeStamp = QDateTime::currentDateTime();
00525 QStringList ret (m_queueData);
00526 return ret;
00527 }
00528 m_opt->ssh()->unlockConnection(ssh);
00529
00530 queueData = stdout_str.split("\n", QString::SkipEmptyParts);
00531
00532 QStringList ret (m_queueData);
00533 queueTimeStamp = QDateTime::currentDateTime();
00534 return ret;
00535 }
00536
00537 }
00538
00540
00541 #endif // ENABLE_SSH