4 Commits

3 changed files with 56 additions and 14 deletions
Split View
  1. +1
    -0
      server/base-server/internal/common/constant.go
  2. +40
    -1
      server/base-server/internal/service/trainjob/train_job.go
  3. +15
    -13
      server/base-server/internal/service/trainjob/train_job_test.go

+ 1
- 0
server/base-server/internal/common/constant.go View File

@@ -3,6 +3,7 @@ package common
// Shared constants of package common.
const (
// BillingPrecision is 2.
// NOTE(review): presumably the number of decimal places kept for billing
// amounts — confirm against its callers, which are not visible here.
BillingPrecision = 2
// RdmaPrefix is the resource-name prefix the train job service checks for
// (via strings.HasPrefix) when deciding whether a task's container needs
// the RDMA-specific security context.
RdmaPrefix = "rdma/"
// NPUResourceName is the resource name the train job service compares a
// resource spec key against; on an exact match the task's first container
// is made privileged and the Ascend driver host paths are mounted into it.
NPUResourceName = "npu.huawei.com/NPU"
)

const (


+ 40
- 1
server/base-server/internal/service/trainjob/train_job.go View File

@@ -612,7 +612,7 @@ func (s *trainJobService) submitJob(ctx context.Context, job *model.TrainJob, st
{Event: vcBus.TaskCompletedEvent, Action: vcBus.CompleteJobAction},
}
}
//根据资源类型任务区别挂载与配置
for k, _ := range startJobInfo.specs[i.ResourceSpecId].resources {
if strings.HasPrefix(string(k), common.RdmaPrefix) {
task.Template.Spec.Containers[0].SecurityContext = &v1.SecurityContext{
@@ -621,6 +621,45 @@ func (s *trainJobService) submitJob(ctx context.Context, job *model.TrainJob, st
},
}
}

//NPU挂载与权限
if string(k) == common.NPUResourceName {
//1. privileged
//处理空情况
if task.Template.Spec.Containers[0].SecurityContext == nil {
task.Template.Spec.Containers[0].SecurityContext = &v1.SecurityContext{
}
}
privileged := true
task.Template.Spec.Containers[0].SecurityContext.Privileged = &privileged
//2.挂载/usr/local/Ascend/driver驱动与/etc/ascend_install.info驱动信息
task.Template.Spec.Volumes = append(task.Template.Spec.Volumes, v1.Volume{
Name: "ascend-driver-volume",
VolumeSource: v1.VolumeSource{
HostPath: &v1.HostPathVolumeSource{
Path: "/usr/local/Ascend/driver",
},
},
},v1.Volume{
Name: "ascend-driver-info",
VolumeSource: v1.VolumeSource{
HostPath: &v1.HostPathVolumeSource{
Path: "/etc/ascend_install.info",
},
},
})

task.Template.Spec.Containers[0].VolumeMounts = append(task.Template.Spec.Containers[0].VolumeMounts,
v1.VolumeMount{
Name: "ascend-driver-volume",
MountPath: "/usr/local/Ascend/driver",
},
v1.VolumeMount{
Name: "ascend-driver-info",
MountPath: "/etc/ascend_install.info",
})

}
}
tasks = append(tasks, task)
}


+ 15
- 13
server/base-server/internal/service/trainjob/train_job_test.go View File

@@ -45,28 +45,30 @@ func TestCreateTrainJob(t *testing.T) {
baseServerConfig := make([]*api.Config, 0)
parameters := make([]*api.Parameter, 0)
parameter := &api.Parameter{
Key: "key-1",
Value: "value-1",
}
parameters = append(parameters, parameter)
reqConfig := &api.Config{
Command: "sleep 100",
Command: "python /code/npu-test/npu-test.py",
Parameters: parameters,
ResourceSpecId: "resourceSpecId1",
ResourceSpecId: "853765f501bb4f69be6fe72d3b36917d",
TaskNumber: 1,
MinFailedTaskCount: 1,
MinSucceededTaskCount: 1,
}
baseServerConfig = append(baseServerConfig, reqConfig)

reply, err := client.TrainJob(ctx, &api.TrainJobRequest{
UserId: "ddbe4b31-cc13-416f-aa80-97495abb80c2",
WorkspaceId: "workspace_id_1",
Name: "train-job-test-000",
UserId: "cfb1f7a8cb0a4eb6a5a20987765dbf23",
WorkspaceId: "default-workspace",
Name: "npu-test-1",
Desc: "this is a test",
AlgorithmId: "algorithmId",
AlgorithmVersion: "algorithmVersion",
ImageId: "imageId",
DataSetId: "dataSetId",
DataSetVersion: "0.0.1",
AlgorithmId: "2bbe052081074fe5bc52da486559cd3d",
AlgorithmVersion: "V1",
ImageId: "6baf34d8836b425186a56f5179ec088b",
DataSetId: "9b402ecb53364336b99f5423fe5efb75",
DataSetVersion: "V1",
IsDistributed: false,
ResourcePool: "common-pool",
Config: baseServerConfig,
})
if err != nil {
@@ -83,7 +85,7 @@ func TestTrainJobInfo(t *testing.T) {
}

ctx := context.Background()
reply, err := client.GetTrainJobInfo(ctx, &api.TrainJobInfoRequest{Id: "c1dd7af8-98c1-41cc-9491-934dc25b0692"})
reply, err := client.GetTrainJobInfo(ctx, &api.TrainJobInfoRequest{Id: "j21e6fac76aa41dabd8ffce92aa43768"})
if err != nil {
panic(err)
}


Loading…
Cancel
Save
Baidu
map