X Tutup
Skip to content

Commit 3281909

Browse files
committed
Handle shim being sigkilled while containerd is down
Signed-off-by: Kenfe-Mickael Laventure <mickael.laventure@gmail.com>
1 parent b6b2fd6 commit 3281909

File tree

3 files changed

+122
-2
lines changed

3 files changed

+122
-2
lines changed

runtime/container.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -629,6 +629,10 @@ func (c *container) waitForCreate(p *process, cmd *exec.Cmd) error {
629629
if err != nil {
630630
return err
631631
}
632+
err = p.saveStartTime()
633+
if err != nil {
634+
logrus.Warnf("containerd: unable to save %s:%s starttime: %v", p.container.id, p.id)
635+
}
632636
return nil
633637
case <-time.After(c.timeout):
634638
cmd.Process.Kill()

runtime/process.go

Lines changed: 115 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,10 @@ import (
99
"os/exec"
1010
"path/filepath"
1111
"strconv"
12+
"strings"
1213
"sync"
1314
"syscall"
15+
"time"
1416

1517
"github.com/Sirupsen/logrus"
1618
"github.com/docker/containerd/specs"
@@ -126,6 +128,13 @@ func loadProcess(root, id string, c *container, s *ProcessState) (*process, erro
126128
},
127129
state: Stopped,
128130
}
131+
132+
startTime, err := ioutil.ReadFile(filepath.Join(p.root, StartTimeFile))
133+
if err != nil && !os.IsNotExist(err) {
134+
return nil, err
135+
}
136+
p.startTime = string(startTime)
137+
129138
if _, err := p.getPidFromFile(); err != nil {
130139
return nil, err
131140
}
@@ -151,6 +160,30 @@ func loadProcess(root, id string, c *container, s *ProcessState) (*process, erro
151160
return p, nil
152161
}
153162

163+
// readProcStatField returns one field of /proc/<pid>/stat as a string.
//
// Fields are numbered from 1, following the proc(5) manual: field 1 is the
// pid, field 2 the command name (returned without its enclosing
// parentheses), field 4 the parent pid, field 22 the start time, etc.
//
// The command name may itself contain spaces and parentheses, so it is
// delimited by the first "(" and the last ")" found in the data instead of
// by splitting on fixed separators. An error is returned when the stat file
// cannot be read, when its content is malformed, or when the requested
// field does not exist.
func readProcStatField(pid int, field int) (string, error) {
	if field < 1 {
		return "", fmt.Errorf("invalid stat field %d for pid %d", field, pid)
	}

	data, err := ioutil.ReadFile(filepath.Join(string(filepath.Separator), "proc", strconv.Itoa(pid), "stat"))
	if err != nil {
		return "", err
	}
	stat := string(data)

	// The command name is the only free-form field: everything between the
	// first opening and the last closing parenthesis belongs to it.
	lparen := strings.Index(stat, "(")
	rparen := strings.LastIndex(stat, ")")
	if lparen < 0 || rparen < lparen {
		return "", fmt.Errorf("malformed stat data for pid %d", pid)
	}

	switch field {
	case 1:
		return strings.TrimSpace(stat[:lparen]), nil
	case 2:
		return stat[lparen+1 : rparen], nil
	}

	// The remaining fields are whitespace separated; strings.Fields also
	// discards the trailing newline. rest[0] holds field 3.
	rest := strings.Fields(stat[rparen+1:])
	if idx := field - 3; idx < len(rest) {
		return rest[idx], nil
	}
	return "", fmt.Errorf("stat data for pid %d has no field %d", pid, field)
}
186+
154187
type process struct {
155188
root string
156189
id string
@@ -165,6 +198,7 @@ type process struct {
165198
cmdDoneCh chan struct{}
166199
state State
167200
stateLock sync.Mutex
201+
startTime string
168202
}
169203

170204
func (p *process) ID() string {
@@ -195,7 +229,47 @@ func (p *process) Resize(w, h int) error {
195229
}
196230

197231
func (p *process) handleSigkilledShim(rst int, rerr error) (int, error) {
198-
if rerr == nil || p.cmd == nil || p.cmd.Process == nil {
232+
if p.cmd == nil || p.cmd.Process == nil {
233+
e := unix.Kill(p.pid, 0)
234+
if e == syscall.ESRCH {
235+
return rst, rerr
236+
}
237+
238+
// If it's not the same process, just mark it stopped and set
239+
// the status to 255
240+
if same, err := p.isSameProcess(); !same {
241+
logrus.Warnf("containerd: %s:%s (pid %d) is not the same process anymore (%v)", p.container.id, p.id, p.pid, err)
242+
p.stateLock.Lock()
243+
p.state = Stopped
244+
p.stateLock.Unlock()
245+
// Create the file so we get the exit event generated once monitor kicks in
246+
// without going through this whole process again
247+
rerr = ioutil.WriteFile(filepath.Join(p.root, ExitStatusFile), []byte("255"), 0644)
248+
return 255, nil
249+
}
250+
251+
ppid, err := readProcStatField(p.pid, 4)
252+
if err != nil {
253+
return rst, fmt.Errorf("could not check process ppid: %v (%v)", err, rerr)
254+
}
255+
if ppid == "1" {
256+
logrus.Warnf("containerd: %s:%s shim died, killing associated process", p.container.id, p.id)
257+
unix.Kill(p.pid, syscall.SIGKILL)
258+
// wait for the process to die
259+
for {
260+
e := unix.Kill(p.pid, 0)
261+
if e == syscall.ESRCH {
262+
break
263+
}
264+
time.Sleep(10 * time.Millisecond)
265+
}
266+
267+
rst = 128 + int(syscall.SIGKILL)
268+
// Create the file so we get the exit event generated once monitor kicks in
269+
// without going to this all process again
270+
rerr = ioutil.WriteFile(filepath.Join(p.root, ExitStatusFile), []byte(fmt.Sprintf("%d", rst)), 0644)
271+
}
272+
199273
return rst, rerr
200274
}
201275

@@ -218,6 +292,9 @@ func (p *process) handleSigkilledShim(rst int, rerr error) (int, error) {
218292
wpid int
219293
)
220294

295+
// Some processes change their PR_SET_PDEATHSIG, so force kill them
296+
unix.Kill(p.pid, syscall.SIGKILL)
297+
221298
for wpid == 0 {
222299
wpid, e = unix.Wait4(p.pid, &status, unix.WNOHANG, &rusage)
223300
if e != nil {
@@ -244,7 +321,9 @@ func (p *process) handleSigkilledShim(rst int, rerr error) (int, error) {
244321
func (p *process) ExitStatus() (rst int, rerr error) {
245322
data, err := ioutil.ReadFile(filepath.Join(p.root, ExitStatusFile))
246323
defer func() {
247-
rst, rerr = p.handleSigkilledShim(rst, rerr)
324+
if rerr != nil {
325+
rst, rerr = p.handleSigkilledShim(rst, rerr)
326+
}
248327
}()
249328
if err != nil {
250329
if os.IsNotExist(err) {
@@ -297,6 +376,40 @@ func (p *process) getPidFromFile() (int, error) {
297376
return i, nil
298377
}
299378

379+
func (p *process) readStartTime() (string, error) {
380+
return readProcStatField(p.pid, 22)
381+
}
382+
383+
func (p *process) saveStartTime() error {
384+
startTime, err := p.readStartTime()
385+
if err != nil {
386+
return err
387+
}
388+
389+
p.startTime = startTime
390+
return ioutil.WriteFile(filepath.Join(p.root, StartTimeFile), []byte(startTime), 0644)
391+
}
392+
393+
func (p *process) isSameProcess() (bool, error) {
394+
// for backward compat assume it's the same if startTime wasn't set
395+
if p.startTime == "" {
396+
return true, nil
397+
}
398+
if p.pid == 0 {
399+
_, err := p.getPidFromFile()
400+
if err != nil {
401+
return false, err
402+
}
403+
}
404+
405+
startTime, err := p.readStartTime()
406+
if err != nil {
407+
return false, err
408+
}
409+
410+
return startTime == p.startTime, nil
411+
}
412+
300413
// Wait will reap the shim process
301414
func (p *process) Wait() {
302415
if p.cmdDoneCh != nil {

runtime/runtime.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ const (
4444
// InitProcessID holds the special ID used for the very first
4545
// container's process
4646
InitProcessID = "init"
47+
// StartTimeFile holds the name of the file in which the process
48+
// start time is saved
49+
StartTimeFile = "starttime"
4750
)
4851

4952
// Checkpoint holds information regarding a container checkpoint

0 commit comments

Comments
 (0)
X Tutup