机器重启后本地盘不见了

机器重启后本地盘不见了

机器重启后本地盘不见了

有一天,测试同事跟我们 MM 说主机好慢,登录不上。MM 登录控制台查看,发现内存占满了;沟通之后决定先重启恢复业务,回头再限制占用内存较多的应用,调小 Java heap size。

不料,重启后,/wls卷不见了。

MM找我解决,不要慌,我看看。

看看现场

# df -h
Filesystem      Size  Used Avail Use% Mounted on
/dev/vda1        50G  5.5G   42G  12% /
devtmpfs         16G     0   16G   0% /dev
tmpfs            16G   24K   16G   1% /dev/shm
tmpfs            16G  516K   16G   1% /run
tmpfs            16G     0   16G   0% /sys/fs/cgroup
tmpfs           3.2G     0  3.2G   0% /run/user/0
tmpfs           3.2G     0  3.2G   0% /run/user/698

配置文件是不是配错了?

# cat /etc/fstab 
/dev/vda1            /                    ext3       noatime,acl,user_xattr 1 1
proc                 /proc                proc       defaults              0 0
sysfs                /sys                 sysfs      noauto                0 0
debugfs              /sys/kernel/debug    debugfs    noauto                0 0
devpts               /dev/pts             devpts     mode=0620,gid=5       0 0
/dev/VolGroup01/LVwls /wls ext4 defaults 0 0
stgnas:publicdata /wls/data glusterfs defaults 0 0

格式错了?
也不是,配置都有,格式也对。

strace一把。

# strace mount -a
# 省略一堆信息。。。
readlink("/dev", 0x7fff6caedfe0, 4096)  = -1 EINVAL (Invalid argument)
readlink("/dev/VolGroup01", 0x7fff6caedfe0, 4096) = -1 EINVAL (Invalid argument)
readlink("/dev/VolGroup01/LVwls", "../dm-0", 4096) = 7
readlink("/dev/dm-0", 0x7fff6caedfe0, 4096) = -1 EINVAL (Invalid argument)
open("/sys/block/dm-0/dm/name", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0444, st_size=4096, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2ab01e477000
read(3, "VolGroup01-LVwls\n", 4096)     = 17
access("/dev/mapper/VolGroup01-LVwls", F_OK) = 0
close(3)                                = 0
munmap(0x2ab01e477000, 4096)            = 0
getcwd("/dsroot", 4095)                 = 8
readlink("/dsroot/stgnas:publicdata", 0x7fff6caedfe0, 4096) = -1 ENOENT (No such file or directory)
close(1)                                = 0
close(2)                                = 0
exit_group(0)                           = ?
+++ exited with 0 +++

看到了,无效参数(EINVAL)。
难道是逻辑卷有问题?(事后看这是个误判:readlink 对不是符号链接的路径返回 EINVAL 属于正常现象,而且 mount -a 最后是以 0 退出的。)

# lvs
  LV    VG         Attr       LSize   Pool Origin Data%  Meta%  Move Log Cpy%Sync Convert
  LVwls VolGroup01 -wi-ao---- <50.00g                              

what? 没啥问题呀

lrwxrwxrwx 1 root root 7 Jan 25 15:52 /dev/mapper/VolGroup01-LVwls -> ../dm-0
drwxr-xr-x 2 root root 80 Jan 25 15:52 /dev/mapper
drwxr-xr-x 17 root root 2920 Jan 25 15:52 /dev
# dmsetup ls
VolGroup01-LVwls        (252:0)

大脑高速运转中。。。


# df -h
Filesystem      Size  Used Avail Use% Mounted on
/dev/vda1        50G  5.4G   42G  12% /
devtmpfs         16G     0   16G   0% /dev
tmpfs            16G   24K   16G   1% /dev/shm
tmpfs            16G  572K   16G   1% /run
tmpfs            16G     0   16G   0% /sys/fs/cgroup
tmpfs           3.2G     0  3.2G   0% /run/user/0
tmpfs           3.2G     0  3.2G   0% /run/user/602
tmpfs           3.2G     0  3.2G   0% /run/user/698
# cat /etc/mtab
rootfs / rootfs rw 0 0
sysfs /sys sysfs rw,relatime 0 0
proc /proc proc rw,relatime 0 0
devtmpfs /dev devtmpfs rw,nosuid,size=16380992k,nr_inodes=4095248,mode=755 0 0
securityfs /sys/kernel/security securityfs rw,nosuid,nodev,noexec,relatime 0 0
tmpfs /dev/shm tmpfs rw,nosuid,nodev 0 0
devpts /dev/pts devpts rw,relatime,gid=5,mode=620,ptmxmode=000 0 0
tmpfs /run tmpfs rw,nosuid,nodev,mode=755 0 0
tmpfs /sys/fs/cgroup tmpfs ro,nosuid,nodev,noexec,mode=755 0 0
cgroup /sys/fs/cgroup/systemd cgroup rw,nosuid,nodev,noexec,relatime,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd 0 0
pstore /sys/fs/pstore pstore rw,nosuid,nodev,noexec,relatime 0 0
cgroup /sys/fs/cgroup/pids cgroup rw,nosuid,nodev,noexec,relatime,pids 0 0
cgroup /sys/fs/cgroup/hugetlb cgroup rw,nosuid,nodev,noexec,relatime,hugetlb 0 0
cgroup /sys/fs/cgroup/cpuset cgroup rw,nosuid,nodev,noexec,relatime,cpuset 0 0
cgroup /sys/fs/cgroup/net_cls,net_prio cgroup rw,nosuid,nodev,noexec,relatime,net_prio,net_cls 0 0
cgroup /sys/fs/cgroup/freezer cgroup rw,nosuid,nodev,noexec,relatime,freezer 0 0
cgroup /sys/fs/cgroup/cpu,cpuacct cgroup rw,nosuid,nodev,noexec,relatime,cpuacct,cpu 0 0
cgroup /sys/fs/cgroup/devices cgroup rw,nosuid,nodev,noexec,relatime,devices 0 0
cgroup /sys/fs/cgroup/blkio cgroup rw,nosuid,nodev,noexec,relatime,blkio 0 0
cgroup /sys/fs/cgroup/memory cgroup rw,nosuid,nodev,noexec,relatime,memory 0 0
cgroup /sys/fs/cgroup/perf_event cgroup rw,nosuid,nodev,noexec,relatime,perf_event 0 0
configfs /sys/kernel/config configfs rw,relatime 0 0
/dev/vda1 / ext3 rw,noatime,data=ordered 0 0
systemd-1 /proc/sys/fs/binfmt_misc autofs rw,relatime,fd=36,pgrp=1,timeout=0,minproto=5,maxproto=5,direct,pipe_ino=10148 0 0
debugfs /sys/kernel/debug debugfs rw,relatime 0 0
hugetlbfs /dev/hugepages hugetlbfs rw,relatime 0 0
mqueue /dev/mqueue mqueue rw,relatime 0 0
/dev/mapper/VolGroup01-LVwls /wls ext4 rw,relatime,data=ordered 0 0
sunrpc /var/lib/nfs/rpc_pipefs rpc_pipefs rw,relatime 0 0
stgnas:publicdata /wls/data fuse.glusterfs rw,relatime,user_id=0,group_id=0,default_permissions,allow_other,max_read=131072 0 0
fusectl /sys/fs/fuse/connections fusectl rw,relatime 0 0
tmpfs /run/user/0 tmpfs rw,nosuid,nodev,relatime,size=3278124k,mode=700 0 0
tmpfs /run/user/602 tmpfs rw,nosuid,nodev,relatime,size=3278124k,mode=700,uid=602,gid=602 0 0
tmpfs /run/user/698 tmpfs rw,nosuid,nodev,relatime,size=3278124k,mode=700,uid=698,gid=601 0 0
# cat /etc/fstab
/dev/vda1 / ext3 noatime,acl,user_xattr 1 1
proc /proc proc defaults 0 0
sysfs /sys sysfs noauto 0 0
debugfs /sys/kernel/debug debugfs noauto 0 0
devpts /dev/pts devpts mode=0620,gid=5 0 0
/dev/VolGroup01/LVwls /wls ext4 defaults 0 0
stgnas:publicdata /wls/data glusterfs defaults 0 0
#10.1.105.118:/DSPS /DSPS_FTP nfs defaults 0 0
#
#
#
# cat -A /etc/fstab
/dev/vda1 / ext3 noatime,acl,user_xattr 1 1$
proc /proc proc defaults 0 0$
sysfs /sys sysfs noauto 0 0$
debugfs /sys/kernel/debug debugfs noauto 0 0$
devpts /dev/pts devpts mode=0620,gid=5 0 0$
/dev/VolGroup01/LVwls /wls ext4 defaults 0 0$
stgnas:publicdata /wls/data glusterfs defaults 0 0$

看看会不会进入维护模式,再重启一把。
机器起来后,依然没有挂载卷,strace 再跟踪一次。

# strace mount -a
# 省略一堆信息。。。
link("/etc/mtab~.2240", "/etc/mtab~")   = -1 EEXIST (File exists)
open("/etc/mtab~", O_WRONLY|O_CLOEXEC)  = 3
rt_sigaction(SIGALRM, {0x2b9ff8de81a0, ~[RTMIN RT_1], SA_RESTORER, 0x2b9ff96b2270}, {SIG_DFL, [], SA_RESTORER, 0x2b9ff96b2270}, 8) = 0
alarm(1)                                = 0
fcntl(3, F_SETLKW, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = 0
alarm(0)                                = 1
rt_sigaction(SIGALRM, {SIG_DFL, [], SA_RESTORER, 0x2b9ff96b2270}, NULL, 8) = 0
nanosleep({0, 5000000}, NULL)           = 0
close(3)                                = 0
link("/etc/mtab~.2240", "/etc/mtab~")   = -1 EEXIST (File exists)

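终于看到曙光了:strace 显示 mount 在反复尝试创建锁文件 /etc/mtab~(返回 EEXIST,文件已存在),也就是说它在把 /etc/mtab 当作普通文件来加锁、更新;而正常情况下 /etc/mtab 应该是指向 /proc/self/mounts 的软链接。这个普通文件里还残留着 /wls 的旧挂载记录,mount -a 很可能因此认为该卷已经挂载而直接跳过。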


# ll /proc/self/mounts
-r--r--r-- 1 root root 0 Jan 25 16:09 /proc/self/mounts
# file /etc/mtab
/etc/mtab: ASCII text

找到问题,解决就简单了:


# rm /etc/mtab
rm: remove regular file ‘/etc/mtab’? y
# ln -s /proc/self/mounts /etc/mtab

mount -a 搞定收工。

问了相关同事,没人修改这个文件,难道是腾讯云的bug? 呵呵

发表回复

您的邮箱地址不会被公开。 必填项已用 * 标注