@@ -2,6 +2,7 @@ package repo
import (
"bufio"
"code.gitea.io/gitea/services/lock"
"encoding/json"
"errors"
"fmt"
@@ -40,8 +41,6 @@ import (
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/modelarts"
"code.gitea.io/gitea/modules/redis/redis_key"
"code.gitea.io/gitea/modules/redis/redis_lock"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/storage"
"code.gitea.io/gitea/modules/util"
@@ -135,7 +134,7 @@ func cloudBrainNewDataPrepare(ctx *context.Context, jobType string) error {
defaultMode = "alogrithm"
}
ctx.Data["benchmarkMode"] = defaultMode
NotStopTaskCount, _ := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeCloudBrainOne, jobType)
NotStopTaskCount, _ := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, jobType)
ctx.Data["NotStopTaskCount"] = NotStopTaskCount
if ctx.Cloudbrain != nil {
@@ -229,15 +228,37 @@ func cloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
tpl = tplCloudBrainTrainJobNew
}
lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), jobType, displayJobName))
isOk, err := lock.Lock(models.CloudbrainKeyDuration)
if !isOk {
log.Error("lock processed failed:%v", err, ctx.Data["MsgID"])
spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
JobType: models.JobType(jobType),
ComputeResource: models.GPU,
Cluster: models.OpenICluster,
AiCenterCode: models.AICenterOfCloudBrainOne})
if err != nil || spec == nil {
cloudBrainNewDataPrepare(ctx, jobType)
ctx.RenderWithErr("Resource specification not available", tpl, &form)
return
}
if !account.IsPointBalanceEnough(ctx.User.ID, models.PointDeductCondition{SpecUnitPrice: spec.UnitPrice}) {
log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
cloudBrainNewDataPrepare(ctx, jobType)
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_samejob_err"), tpl, &form)
ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tpl, &form)
return
}
lockOperator, errMsg := cloudbrainService.Lock4CloudbrainCreation(&lock.LockContext{Repo: ctx.Repo.Repository, Task: &models.Cloudbrain{DisplayJobName: displayJobName, JobType: form.JobType}, User: ctx.User})
defer func() {
if lockOperator != nil {
lockOperator.Unlock()
}
}()
if errMsg != "" {
log.Error("lock processed failed:%s", errMsg, ctx.Data["MsgID"])
cloudBrainNewDataPrepare(ctx, jobType)
ctx.RenderWithErr(ctx.Tr(errMsg), tpl, &form)
return
}
defer lock.UnLock()
tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, jobType, displayJobName)
if err == nil {
@@ -269,7 +290,7 @@ func cloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
return
}
count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeCloudBrainOne, jobType)
count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, jobType)
if err != nil {
log.Error("GetCloudbrainCountByUserID failed:%v", err, ctx.Data["MsgID"])
cloudBrainNewDataPrepare(ctx, jobType)
@@ -285,7 +306,7 @@ func cloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
}
var datasetInfos map[string]models.DatasetInfo
var datasetNames string
// var
var attachSize int64
if uuids != "" {
datasetInfos, datasetNames, err = models.GetDatasetInfo(uuids)
if err != nil {
@@ -294,6 +315,18 @@ func cloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form)
return
}
if jobType == string(models.JobTypeDebug) {
for _, infos := range datasetInfos {
attachSize += infos.Size
}
if attachSize > int64(setting.DebugAttachSize*1000*1000*1000) {
log.Error("The DatasetSize exceeds the limit (%dGB)", setting.DebugAttachSize) // GB
cloudBrainNewDataPrepare(ctx, jobType)
ctx.RenderWithErr(ctx.Tr("cloudbrain.error.debug_datasetsize", setting.DebugAttachSize), tpl, &form)
return
}
}
}
command := cloudbrain.GetCloudbrainDebugCommand()
@@ -329,24 +362,6 @@ func cloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)
spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
JobType: models.JobType(jobType),
ComputeResource: models.GPU,
Cluster: models.OpenICluster,
AiCenterCode: models.AICenterOfCloudBrainOne})
if err != nil || spec == nil {
cloudBrainNewDataPrepare(ctx, jobType)
ctx.RenderWithErr("Resource specification not available", tpl, &form)
return
}
if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
cloudBrainNewDataPrepare(ctx, jobType)
ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tpl, &form)
return
}
req := cloudbrain.GenerateCloudBrainTaskReq{
Ctx: ctx,
DisplayJobName: displayJobName,
@@ -387,7 +402,6 @@ func cloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
req.ModelVersion = form.ModelVersion
req.PreTrainModelPath = setting.Attachment.Minio.RealPath + form.PreTrainModelUrl
req.PreTrainModelUrl = form.PreTrainModelUrl
}
_, err = cloudbrain.GenerateTask(req)
@@ -451,15 +465,36 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBra
repo := ctx.Repo.Repository
tpl := tplCloudBrainInferenceJobNew
lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), jobType, displayJobName))
isOk, err := lock.Lock(models.CloudbrainKeyDuration)
if !isOk {
log.Error("lock processed failed:%v", err, ctx.Data["MsgID"])
spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
JobType: models.JobTypeInference,
ComputeResource: models.GPU,
Cluster: models.OpenICluster,
AiCenterCode: models.AICenterOfCloudBrainOne})
if err != nil || spec == nil {
cloudBrainNewDataPrepare(ctx, jobType)
ctx.RenderWithErr("Resource specification not available", tpl, &form)
return
}
if !account.IsPointBalanceEnough(ctx.User.ID, models.PointDeductCondition{SpecUnitPrice: spec.UnitPrice}) {
log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
cloudBrainNewDataPrepare(ctx, jobType)
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_samejob_err"), tpl, &form)
ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tpl, &form)
return
}
lockOperator, errMsg := cloudbrainService.Lock4CloudbrainCreation(&lock.LockContext{Repo: ctx.Repo.Repository, Task: &models.Cloudbrain{DisplayJobName: displayJobName, JobType: jobType}, User: ctx.User})
defer func() {
if lockOperator != nil {
lockOperator.Unlock()
}
}()
if errMsg != "" {
log.Error("lock processed failed:%s", errMsg, ctx.Data["MsgID"])
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
ctx.RenderWithErr(ctx.Tr(errMsg), tpl, &form)
return
}
defer lock.UnLock()
ckptUrl := setting.Attachment.Minio.RealPath + form.TrainUrl + form.CkptName
log.Info("ckpt url:" + ckptUrl)
@@ -502,7 +537,7 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBra
return
}
count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeCloudBrainOne, jobType)
count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, jobType)
if err != nil {
log.Error("GetCloudbrainCountByUserID failed:%v", err, ctx.Data["MsgID"])
cloudBrainNewDataPrepare(ctx, jobType)
@@ -536,22 +571,7 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBra
ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form)
return
}
spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
JobType: models.JobTypeInference,
ComputeResource: models.GPU,
Cluster: models.OpenICluster,
AiCenterCode: models.AICenterOfCloudBrainOne})
if err != nil || spec == nil {
cloudBrainNewDataPrepare(ctx, jobType)
ctx.RenderWithErr("Resource specification not available", tpl, &form)
return
}
if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
cloudBrainNewDataPrepare(ctx, jobType)
ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tpl, &form)
return
}
req := cloudbrain.GenerateCloudBrainTaskReq{
Ctx: ctx,
DisplayJobName: displayJobName,
@@ -592,8 +612,10 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBra
}
/**
检查用户传输的参数是否符合专属资源池
/*
*
检查用户传输的参数是否符合专属资源池
*/
func checkCloudBrainSpecialPool(ctx *context.Context, jobType string, queue string, resourceSpecId int) string {
if cloudbrain.SpecialPools != nil {
@@ -634,25 +656,21 @@ func CloudBrainRestart(ctx *context.Context) {
var errorMsg = ""
var status = string(models.JobWaiting)
task := ctx.Cloudbrain
for {
if task.Status != string(models.JobStopped) && task.Status != string(models.JobSucceeded) && task.Status != string(models.JobFailed) {
log.Error("the job(%s) is not stopped", task.JobName, ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "the job is not stopped"
break
}
if task.Image == "" || task.GpuQueue == "" || task.Type != models.TypeCloudBrainOne {
log.Error("the job(%s) version is too old", task.JobName, ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "the job's version is too old and can not be restarted"
break
lockOperator, errMsg := cloudbrainService.Lock4CloudbrainRestart(&lock.LockContext{Repo: ctx.Repo.Repository, Task: &models.Cloudbrain{JobType: task.JobType}, User: ctx.User})
defer func() {
if lockOperator != nil {
lockOperator.Unlock()
}
}()
if !ctx.IsSigned || (ctx.User.ID != task.UserID && !ctx.IsUserSiteAdmin()) {
log.Error("the user has no right ro restart the job", task.JobName, ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "you have no right to restart the job"
if errMsg != "" {
log.Error("lock processed failed:%s", errMsg, ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = ctx.Tr(errMsg)
}
for {
if errorMsg != "" {
break
}
@@ -676,13 +694,34 @@ func CloudBrainRestart(ctx *context.Context) {
}
task.Spec = spec
if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
if !account.IsPointBalanceEnough(ctx.User.ID, models.PointDeductCondition{SpecUnitPrice: spec.UnitPrice} ) {
log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
resultCode = "-1"
errorMsg = ctx.Tr("points.insufficient_points_balance")
break
}
if task.Status != string(models.JobStopped) && task.Status != string(models.JobSucceeded) && task.Status != string(models.JobFailed) {
log.Error("the job(%s) is not stopped", task.JobName, ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "the job is not stopped"
break
}
if task.Image == "" || task.GpuQueue == "" || task.Type != models.TypeCloudBrainOne {
log.Error("the job(%s) version is too old", task.JobName, ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "the job's version is too old and can not be restarted"
break
}
if !ctx.IsSigned || (ctx.User.ID != task.UserID && !ctx.IsUserSiteAdmin()) {
log.Error("the user has no right ro restart the job", task.JobName, ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "you have no right to restart the job"
break
}
if _, err := os.Stat(getOldJobPath(task)); err != nil {
log.Error("Can not find job minio path", err)
resultCode = "-1"
@@ -690,7 +729,7 @@ func CloudBrainRestart(ctx *context.Context) {
break
}
count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeCloudBrainOne, string(models.JobTypeDebug))
count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, string(models.JobTypeDebug))
if err != nil {
log.Error("GetCloudbrainCountByUserID failed:%v", err, ctx.Data["MsgID"])
resultCode = "-1"
@@ -843,7 +882,7 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo
if task.JobType == string(models.JobTypeBenchmark) {
task.BenchmarkType = ctx.Tr("repo.cloudbrain.benchmark.algorithm")
} else if task.JobType == string(models.JobTypeSnn4imagenet) || task.JobType == string(models.JobTypeBrainScor e) {
} else if models.IsModelBenchMarkJobType(task.JobTyp e) {
task.BenchmarkType = ctx.Tr("repo.cloudbrain.benchmark.model")
task.BenchmarkTypeName = task.JobType
ctx.Data["BenchmarkTypeName"] = task.JobType
@@ -1309,8 +1348,8 @@ func DeleteJobsByRepoID(repoID int64) {
DeleteJobs(cloudBrains)
}
/**
/*
*
*/
func StopJobs(cloudBrains []*models.Cloudbrain) {
@@ -1340,14 +1379,11 @@ func StopJobs(cloudBrains []*models.Cloudbrain) {
logErrorAndUpdateJobStatus(err, taskInfo)
}
} else if taskInfo.Type == models.TypeC2Net {
if taskInfo.JobType == string(models.JobTypeTrain) {
err := retry(3, time.Second*30, func() error {
_, err := grampus.StopJob(taskInfo.JobID)
return err
})
logErrorAndUpdateJobStatus(err, taskInfo)
}
err := retry(3, time.Second*30, func() error {
_, err := grampus.StopJob(taskInfo.JobID, taskInfo.JobType)
return err
})
logErrorAndUpdateJobStatus(err, taskInfo)
}
}
}
@@ -1692,6 +1728,8 @@ func GetRate(ctx *context.Context) {
ctx.Redirect(setting.Snn4imagenetServerHost)
} else if job.JobType == string(models.JobTypeBrainScore) {
ctx.Redirect(setting.BrainScoreServerHost)
} else if job.JobType == string(models.JobTypeSnn4Ecoset) {
ctx.Redirect(setting.Snn4EcosetServerHost)
} else {
log.Error("JobType error:%s", job.JobType, ctx.Data["msgID"])
}
@@ -2164,7 +2202,7 @@ func CloudBrainBenchmarkIndex(ctx *context.Context) {
}
var jobTypes []string
jobTypes = append(jobTypes, string(models.JobTypeBenchmark), string(models.JobTypeBrainScore), string(models.JobTypeSnn4imagenet), string(models.JobTypeModelSafety))
jobTypes = append(jobTypes, string(models.JobTypeBenchmark), string(models.JobTypeBrainScore), string(models.JobTypeSnn4imagenet), string(models.JobTypeSnn4Ecoset), string(models.JobType ModelSafety))
ciTasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{
ListOptions: models.ListOptions{
Page: page,
@@ -2197,14 +2235,16 @@ func CloudBrainBenchmarkIndex(ctx *context.Context) {
ciTasks[i].BenchmarkTypeName = ""
if ciTasks[i].JobType == string(models.JobTypeBenchmark) {
ciTasks[i].BenchmarkType = ctx.Tr("repo.cloudbrain.benchmark.algorithm")
} else if ciTasks[i].JobType == string(models.JobTypeSnn4imagenet) || ciTasks[i].JobType == string(models.JobTypeBrainScor e) {
} else if models.IsModelBenchMarkJobType(ciTasks[i].JobTyp e) {
ciTasks[i].BenchmarkType = ctx.Tr("repo.cloudbrain.benchmark.model")
ciTasks[i].BenchmarkTypeName = ciTasks[i].JobType
if ciTasks[i].JobType == string(models.JobTypeSnn4imagenet) {
ciTasks[i].BenchmarkTypeRankLink = setting.Snn4imagenetServerHost
} else {
} else if ciTasks[i].JobType == string(models.JobTypeBrainScore) {
ciTasks[i].BenchmarkTypeRankLink = setting.BrainScoreServerHost
} else {
ciTasks[i].BenchmarkTypeRankLink = setting.Snn4EcosetServerHost
}
}
@@ -2373,17 +2413,38 @@ func BenchMarkAlgorithmCreate(ctx *context.Context, form auth.CreateCloudBrainFo
ctx.Data["benchmarkTypeID"] = benchmarkTypeID
ctx.Data["benchmark_child_types_id_hidden"] = benchmarkChildTypeID
lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), form.JobType, displayJobName))
isOk, err := lock.Lock(models.CloudbrainKeyDuration)
if !isOk {
log.Error("lock processed failed:%v", err, ctx.Data["MsgID"])
spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
JobType: models.JobTypeBenchmark,
ComputeResource: models.GPU,
Cluster: models.OpenICluster,
AiCenterCode: models.AICenterOfCloudBrainOne})
if err != nil || spec == nil {
cloudBrainNewDataPrepare(ctx, jobType)
ctx.RenderWithErr("Resource specification not available", tplCloudBrainBenchmarkNew, &form)
return
}
if !account.IsPointBalanceEnough(ctx.User.ID, models.PointDeductCondition{SpecUnitPrice: spec.UnitPrice}) {
log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
cloudBrainNewDataPrepare(ctx, jobType)
ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplCloudBrainBenchmarkNew, &form)
return
}
lockOperator, errMsg := cloudbrainService.Lock4CloudbrainCreation(&lock.LockContext{Repo: ctx.Repo.Repository, Task: &models.Cloudbrain{DisplayJobName: displayJobName, JobType: string(models.JobTypeBenchmark)}, User: ctx.User})
defer func() {
if lockOperator != nil {
lockOperator.Unlock()
}
}()
if errMsg != "" {
log.Error("lock processed failed:%s", errMsg, ctx.Data["MsgID"])
cloudBrainNewDataPrepare(ctx, jobType)
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_samejob_err"), tplCloudBrainBenchmarkNew, &form)
ctx.RenderWithErr(ctx.Tr(errMsg ), tplCloudBrainBenchmarkNew, &form)
return
}
defer lock.UnLock()
tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, jobType, displayJobName)
tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeBenchmark) , displayJobName)
if err == nil {
if len(tasks) != 0 {
log.Error("the job name did already exist", ctx.Data["MsgID"])
@@ -2414,24 +2475,7 @@ func BenchMarkAlgorithmCreate(ctx *context.Context, form auth.CreateCloudBrainFo
return
}
spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
JobType: models.JobTypeBenchmark,
ComputeResource: models.GPU,
Cluster: models.OpenICluster,
AiCenterCode: models.AICenterOfCloudBrainOne})
if err != nil || spec == nil {
cloudBrainNewDataPrepare(ctx, jobType)
ctx.RenderWithErr("Resource specification not available", tplCloudBrainBenchmarkNew, &form)
return
}
if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
cloudBrainNewDataPrepare(ctx, jobType)
ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplCloudBrainBenchmarkNew, &form)
return
}
count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeCloudBrainOne, jobType)
count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, string(models.JobTypeBenchmark))
if err != nil {
log.Error("GetCloudbrainCountByUserID failed:%v", err, ctx.Data["MsgID"])
cloudBrainNewDataPrepare(ctx, jobType)
@@ -2554,7 +2598,6 @@ func ModelBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainForm)
displayJobName := form.DisplayJobName
jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
image := form.Image
uuid := form.Attachment
jobType := form.JobType
codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath
branchName := cloudbrain.DefaultBranchName
@@ -2563,17 +2606,39 @@ func ModelBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainForm)
tpl := tplCloudBrainBenchmarkNew
command := cloudbrain.GetCloudbrainDebugCommand()
lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), jobType, displayJobName))
isOk, err := lock.Lock(models.CloudbrainKeyDuration)
if !isOk {
log.Error("lock processed failed:%v", err, ctx.Data["MsgID"])
spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
JobType: models.JobTypeBenchmark,
ComputeResource: models.GPU,
Cluster: models.OpenICluster,
AiCenterCode: models.AICenterOfCloudBrainOne})
if err != nil || spec == nil {
cloudBrainNewDataPrepare(ctx, jobType)
ctx.RenderWithErr("Resource specification not available", tpl, &form)
return
}
if !account.IsPointBalanceEnough(ctx.User.ID, models.PointDeductCondition{SpecUnitPrice: spec.UnitPrice}) {
log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
cloudBrainNewDataPrepare(ctx, jobType)
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_samejob_err"), tpl, &form)
ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance "), tpl, &form)
return
}
defer lock.UnLock()
tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, jobType, displayJobName)
lockOperator, errMsg := cloudbrainService.Lock4CloudbrainCreation(&lock.LockContext{Repo: ctx.Repo.Repository, Task: &models.Cloudbrain{DisplayJobName: displayJobName, JobType: string(models.JobTypeBenchmark)}, User: ctx.User})
defer func() {
if lockOperator != nil {
lockOperator.Unlock()
}
}()
if errMsg != "" {
log.Error("lock processed failed:%s", errMsg, ctx.Data["MsgID"])
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
ctx.RenderWithErr(ctx.Tr(errMsg), tpl, &form)
return
}
tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeBenchmark), displayJobName)
if err == nil {
if len(tasks) != 0 {
log.Error("the job name did already exist", ctx.Data["MsgID"])
@@ -2596,14 +2661,14 @@ func ModelBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainForm)
return
}
if jobType != string(models.JobTypeSnn4imagenet) && jobType != string(models.JobTypeBrainScor e) {
if !models.IsModelBenchMarkJobType(jobTyp e) {
log.Error("jobtype error:", jobType, ctx.Data["MsgID"])
cloudBrainNewDataPrepare(ctx, jobType)
ctx.RenderWithErr("jobtype error", tpl, &form)
return
}
count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeCloudBrainOne, jobType )
count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, string(models.JobTypeBenchmark) )
if err != nil {
log.Error("GetCloudbrainCountByUserID failed:%v", err, ctx.Data["MsgID"])
cloudBrainNewDataPrepare(ctx, jobType)
@@ -2625,46 +2690,41 @@ func ModelBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainForm)
mkModelPath(modelPath)
uploadCodeToMinio(modelPath, jobName, cloudbrain.ModelMountPath+"/")
snn4imagenetPath := setting.JobPath + jobName + cloudbrain.Snn4imagenet MountPath
benchmarkPath := setting.JobPath + jobName + cloudbrain.BenchMark MountPath
if setting.IsSnn4imagenetEnabled && jobType == string(models.JobTypeSnn4imagenet) {
downloadRateCode(repo, jobName, setting.Snn4imagenetOwner, setting.Snn4imagenetName, snn4imagenet Path, "", "", ctx.User.Name)
uploadCodeToMinio(snn4imagenetPath+"/", jobName, cloudbrain.Snn4imagenet MountPath+"/")
command = fmt.Sprintf(cloudbrain.Snn4imagenetCommand, displayJobName, trimSpaceNewlineInString(form.Description))
downloadRateCode(repo, jobName, setting.Snn4imagenetOwner, setting.Snn4imagenetName, benchmark Path, "", "", ctx.User.Name)
uploadCodeToMinio(benchmarkPath+"/", jobName, cloudbrain.BenchMark MountPath+"/")
command = fmt.Sprintf(cloudbrain.Snn4imagenetCommand, displayJobName, form.CkptName, trimSpaceNewlineInString(form.Description))
}
benchmarkChildTypeID := 0
brainScorePath := setting.JobPath + jobName + cloudbrain.BrainScoreMountPath
if setting.IsBrainScoreEnabled && jobType == string(models.JobTypeBrainScore) {
downloadRateCode(repo, jobName, setting.BrainScoreOwner, setting.BrainScoreName, brainScore Path, "", "", ctx.User.Name)
uploadCodeToMinio(brainScorePath+"/", jobName, cloudbrain.BrainScore MountPath+"/")
downloadRateCode(repo, jobName, setting.BrainScoreOwner, setting.BrainScoreName, benchmark Path, "", "", ctx.User.Name)
uploadCodeToMinio(benchmarkPath+"/", jobName, cloudbrain.BenchMark MountPath+"/")
benchmarkChildTypeID = form.BenchmarkChildTypeID
command = fmt.Sprintf(cloudbrain.BrainScoreCommand, getBrainRegion(benchmarkChildTypeID), displayJobName, trimSpaceNewlineInString(form.Description))
command = fmt.Sprintf(cloudbrain.BrainScoreCommand, getBrainRegion(benchmarkChildTypeID), displayJobName, form.CkptName, trimSpaceNewlineInString(form.Description))
}
var uuid string
var datasetInfos map[string]models.DatasetInfo
var datasetNames string
if setting.IsSnn4EcosetEnabled && jobType == string(models.JobTypeSnn4Ecoset) {
downloadRateCode(repo, jobName, setting.Snn4EcosetOwner, setting.Snn4EcosetName, benchmarkPath, "", "", ctx.User.Name)
uploadCodeToMinio(benchmarkPath+"/", jobName, cloudbrain.BenchMarkMountPath+"/")
command = fmt.Sprintf(cloudbrain.Snn4EcosetCommand, displayJobName, form.CkptName, trimSpaceNewlineInString(form.Description))
datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid)
if err != nil {
log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
cloudBrainNewDataPrepare(ctx, jobType)
ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form)
return
}
spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
JobType: models.JobTypeBenchmark,
ComputeResource: models.GPU,
Cluster: models.OpenICluster,
AiCenterCode: models.AICenterOfCloudBrainOne})
if err != nil || spec == nil {
cloudBrainNewDataPrepare(ctx, jobType)
ctx.RenderWithErr("Resource specification not available", tpl, &form)
return
}
attachment, err := getEcosetAttachment()
if err != nil {
log.Error("load benchmark code failed", err)
cloudBrainNewDataPrepare(ctx, jobType)
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain.morethanonejob"), tpl, &form)
return
}
uuid = attachment.UUID
datasetInfos, datasetNames, _ = models.GetDatasetInfo(uuid)
if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
cloudBrainNewDataPrepare(ctx, jobType)
ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tpl, &form)
return
}
log.Info("Command=" + command)
log.Info("ModelPath=" + storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/"))
req := cloudbrain.GenerateCloudBrainTaskReq{
@@ -2679,8 +2739,6 @@ func ModelBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainForm)
CodePath: storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"),
ModelPath: storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/"),
BenchmarkPath: storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"),
Snn4ImageNetPath: storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"),
BrainScorePath: storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"),
JobType: jobType,
Description: form.Description,
BranchName: branchName,
@@ -2692,6 +2750,14 @@ func ModelBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainForm)
ResultPath: storage.GetMinioPath(jobName, cloudbrain.ResultPath+"/"),
Spec: spec,
}
if form.ModelName != "" {
req.ModelName = form.ModelName
req.LabelName = form.LabelName
req.CkptName = form.CkptName
req.ModelVersion = form.ModelVersion
req.PreTrainModelPath = setting.Attachment.Minio.RealPath + form.PreTrainModelUrl
req.PreTrainModelUrl = form.PreTrainModelUrl
}
_, err = cloudbrain.GenerateTask(req)
if err != nil {
@@ -2703,6 +2769,21 @@ func ModelBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainForm)
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/benchmark")
}
func getEcosetAttachment() (*models.Attachment, error) {
ecosetRepo, err := models.GetRepositoryByOwnerAndName(setting.Snn4EcosetOwner, setting.Snn4EcosetName)
if err != nil {
return nil, err
}
datasetInfo, err := models.GetDatasetByRepo(ecosetRepo)
if err != nil {
return nil, err
}
return models.GetAttachmentByDatasetIdFileName(setting.Snn4AttachmentName, datasetInfo.ID)
}
func getBrainRegion(benchmarkChildTypeID int) string {
values := []string{"V1", "V2", "V4", "IT"}
return values[benchmarkChildTypeID]
@@ -2763,18 +2844,24 @@ func InferenceCloudBrainJobShow(ctx *context.Context) {
cloudBrainShow(ctx, tplCloudBrainInferenceJobShow, models.JobTypeInference)
}
func DownloadInferenceResultFile(ctx *context.Context) {
func DownloadGPU InferenceResultFile(ctx *context.Context) {
var jobID = ctx.Params(":jobid")
var versionName = ctx.Query("version_name")
task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
task, err := models.GetCloudbrainByJobID(jobID)
if err != nil {
log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
return
}
allFile, err := storage.GetAllObjectByBucketAndPrefixMinio(setting.Attachment.Minio.Bucket, task.ResultUrl)
returnFileName := task.DisplayJobName + ".zip"
MinioDownloadManyFile(task.ResultUrl, ctx, returnFileName, allFile)
parentDir := ctx.Query("parentDir")
filePath := "jobs/" + task.JobName + "/result/" + parentDir
log.Info("prefix=" + filePath)
allFile, err := storage.GetAllObjectByBucketAndPrefixMinio(setting.Attachment.Minio.Bucket, filePath)
if err == nil {
returnFileName := task.DisplayJobName + ".zip"
MinioDownloadManyFile(filePath, ctx, returnFileName, allFile)
} else {
log.Info("error,msg=" + err.Error())
ctx.ServerError("no file to download.", err)
}
}
func getInferenceJobCommand(form auth.CreateCloudBrainInferencForm) (string, error) {