ubuntu 16.04安装openPai v1.8.0

ubuntu 16.04安装openPai v1.8.0

abdulla1992
2024-02-04 / 0 评论 / 65 阅读 / 正在检测是否收录...

准备 3台服务器

`172.17.2.105  devBox(管理机器,用于部署和配置环境,可以是虚拟机)`

`172.17.2.106 master(主节点)`

`172.17.2.107 worker(工作机器)`

科学上网

快帆云 (kfcloud.xyz)

三台机器环境安装

1.设置源

vim /etc/apt/sources.list

deb https://mirrors.aliyun.com/ubuntu/ xenial main
deb-src https://mirrors.aliyun.com/ubuntu/ xenial main

deb https://mirrors.aliyun.com/ubuntu/ xenial-updates main
deb-src https://mirrors.aliyun.com/ubuntu/ xenial-updates main

deb https://mirrors.aliyun.com/ubuntu/ xenial universe
deb-src https://mirrors.aliyun.com/ubuntu/ xenial universe
deb https://mirrors.aliyun.com/ubuntu/ xenial-updates universe
deb-src https://mirrors.aliyun.com/ubuntu/ xenial-updates universe

deb https://mirrors.aliyun.com/ubuntu/ xenial-security main
deb-src https://mirrors.aliyun.com/ubuntu/ xenial-security main
deb https://mirrors.aliyun.com/ubuntu/ xenial-security universe
deb-src https://mirrors.aliyun.com/ubuntu/ xenial-security universe

deb [arch=amd64] https://download.docker.com/linux/ubuntu xenial stable
# deb-src [arch=amd64] https://download.docker.com/linux/ubuntu xenial stable

更新源

apt-get update

安装vim

apt-get install vim

安装openssh

apt-get install openssh-server

安装ntp

apt-get install ntp

设置ssh - root远程

vim /etc/ssh/sshd_config
#设置为yes
PermitRootLogin yes
#重启ssh
systemctl restart ssh

安装docker

apt-get remove docker docker-engine docker-ce docker.io
apt-get update
apt-get install -y apt-transport-https ca-certificates curl software-properties-common
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable"
apt-get update
apt-get install -y docker-ce
systemctl start docker

设置docker仓库

mkdir -p /etc/docker
tee /etc/docker/daemon.json <<-'EOF'
{
  "registry-mirrors": ["https://registry.cn-hangzhou.aliyuncs.com"]
}
EOF
systemctl daemon-reload
systemctl restart docker

更新服务器软件

#解决安装 kubespray 后 master 和 worker 上 docker 起不来的问题
apt update && sudo apt upgrade -y

安装代理 ss/v2ray(二选1)

设置ss为允许其他机器连接

右键托盘ss图标,勾选允许其他设备连入

设置v2ray 允许其他

https://oss.abdl.cn/v2ray2.png

https://oss.abdl.cn/v2ray1.png

git 设置代理

git config --global http.proxy http://172.17.2.104:10809/
git config --global https.proxy http://172.17.2.104:10809/

docker 设置代理

vim /etc/systemd/system/docker.service.d/proxy.conf
##新增以下部分
[Service]
Environment="HTTP_PROXY=http://172.17.2.104:10809"
Environment="HTTPS_PROXY=http://172.17.2.104:10809/"
Environment="NO_PROXY=localhost,127.0.0.1,::1"
##重启生效
systemctl daemon-reload
systemctl restart docker

openPai官网安装指导

https://openpai.readthedocs.io/

拉取代码

git clone https://github.com/microsoft/pai.git
cd pai
git checkout v1.8.0

修改配置文件

cd contrib/kubespray/config
#编辑config.yaml
vim config.yaml
#编辑layout.yaml
vim layout.yaml

config.yaml


user: root
password: admin123
docker_image_tag: v1.8.0


gcr_image_repo: "registry.cn-hangzhou.aliyuncs.com"
kube_image_repo: "registry.cn-hangzhou.aliyuncs.com/google_containers"

openpai_kubespray_extra_var:
  pod_infra_image_repo: "registry.cn-hangzhou.aliyuncs.com/google_containers/pause-{{ image_arch }}"
  dnsautoscaler_image_repo: "docker.io/mirrorgooglecontainers/cluster-proportional-autoscaler-{{ image_arch }}"
  tiller_image_repo: "registry.cn-hangzhou.aliyuncs.com/google_containers/kubernetes-helm/tiller"
  registry_proxy_image_repo: "registry.cn-hangzhou.aliyuncs.com/google_containers/kube-registry-proxy"
  metrics_server_image_repo: "registry.cn-hangzhou.aliyuncs.com/google_containers/metrics-server-amd64"
  addon_resizer_image_repo: "registry.cn-hangzhou.aliyuncs.com/google_containers/addon-resizer"
  dashboard_image_repo: "registry.cn-hangzhou.aliyuncs.com/google_containers/kubernetes-dashboard-{{ image_arch }}"

layout.yaml

machine-sku:
  master-machine: # define a machine sku
    # the resource requirements for all the machines of this sku
    # We use the same memory format as Kubernetes, e.g. Gi, Mi
    # Reference: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#meaning-of-memory
    mem: 30Gi
    cpu:
      # the number of CPU vcores
      vcore: 6
  cpu-machine:
    mem: 30Gi
    cpu:
      vcore: 6

machine-list:
  - hostname: master # name of the machine, **do not** use upper case alphabet letters for hostname
    hostip: 172.17.2.106
    machine-type: master-machine # only one master-machine supported
    pai-master: "true"
  - hostname: worker
    hostip: 172.17.2.107
    machine-type: cpu-machine
    pai-worker: "true"

设置pip清华源

参考https://mirrors-i.tuna.tsinghua.edu.cn/help/pypi/

pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple

安装scheme,kubernetes(解决安装k8s报错)

pip install scheme
pip install kubernetes
# 解决 error: command 'x86_64-linux-gnu-gcc' failed with exit status 1
apt-get install build-essential python3-dev libssl-dev libffi-dev libxml2 libxml2-dev libxslt1-dev zlib1g-dev

安装kubespray

cd pai/contrib/kubespray
/bin/bash quick-start-kubespray.sh -v

安装openPai

/bin/bash quick-start-service.sh

打开 openPai

http://${master-ip}

其他

启动dev-box-quick-start 容器,用于管理集群

docker run -itd \
        -e COLUMNS=$COLUMNS -e LINES=$LINES -e TERM=$TERM \
        -v /var/run/docker.sock:/var/run/docker.sock \
        -v ${HOME}/pai-deploy/cluster-cfg:/cluster-configuration  \
        -v ${HOME}/pai-deploy/kube:/root/.kube \
        -v /home/tt/pai:/mnt/pai \
        --pid=host \
        --privileged=true \
        --net=host \
        --name=dev-box-quick-start \
        openpai/dev-box:v1.8.0

进入容器

docker exec -it dev-box-quick-start bash

获取集群ID

cd /pai
./paictl.py config get-id
# 将集群中正在使用的设置拉取到一个文件夹中
# 集群设置文件夹包含三个文件: config.yaml, layout.yaml 和 services-configuration.yaml
# 其中 config.yaml 在 v1.7.0 版本之前存储于 ~/pai-deploy/cluster-cfg/config.yaml
# 如果 <config-folder>中已经有这三个文件了,老的文件将会被覆盖
./paictl.py config pull -o <config-folder>

# 将文件夹中的配置上传到集群内
# 集群设置文件夹应该包含三个文件: config.yaml, layout.yaml 和 services-configuration.yaml
# 其中 config.yaml 在 v1.7.0 版本之前存储于 ~/pai-deploy/cluster-cfg/config.yaml
# 只有被成功上传的配置才会被应用
./paictl.py config push -p <config-folder> -m service

# 停止所有PAI服务
./paictl.py service stop

# 启动所有PAI服务
./paictl.py service start

# 停止某些PAI服务
./paictl.py service stop -n <service-name-1> <service-name-2>

# 启动某些PAI服务
./paictl.py service start -n <service-name-1> <service-name-2>

修改webportal并打包

构建docker镜像(1,记得把webportal.common.dockerfile挪到上级目录,命令最后一个点别忘了)
docker build -t webportal-ab:v1 -f webportal.common.dockerfile .
构建docker镜像(2)
python build/pai_build.py build -s webportal -c /cluster-configuration/
docker 打标签
docker tag webportal:latest abdulla1992/webportal-ab:v12
推送镜像
docker push abdulla1992/webportal-ab:v12
直接编辑k8s容器启动脚本
kubectl edit ds webportal-ds

其他问题

1.ssh 卡住

重启所有节点ssh 服务

systemctl restart ssh
0

评论 (0)

取消