do not sleep to kill a process that has already cleaned up and exited by itself (#672)

* abort kill delay if process closes after interrupt

if the process to be killed honours the SIGINT and closes down,
we should not sleep before trying to kill a process that is
no longer there

* make sure to wait for both wait and kill results when kill delay is 0

* use cmd.process.signal and cmd.process.kill instead of raw syscalls

* switch back to syscall as os.process.signal or kill do not signal the entire group

* remove unstable and slightly dangerous kill test

the test paniced if it ever got a nil err since t.Failf does not
imply that test is stopped

selecting a random pid to kill might have interesting side effects
when run and its result can never be guaranteed to be the same, meaning
that the test had intermittent failures

no code depends on the specific return code of killCmd, so there
does not seem to be a reason to have a unit test for it

* add clarifying comments to the goroutine response collection code

* test: add regression tests for issue #671 (send_interrupt early exit optimization)

Add three comprehensive regression tests to verify the fix in commit 4d26204:

1. Test_killCmd_SendInterrupt_FastGracefulExit
   - Verifies processes that exit quickly on SIGINT return immediately
   - Saves ~2s when process exits in <1ms vs 2s kill_delay

2. Test_killCmd_SendInterrupt_IgnoresSIGINT
   - Verifies processes ignoring SIGINT still get SIGKILL after kill_delay
   - Ensures optimization doesn't break fallback behavior

3. Test_killCmd_SendInterrupt_SlowGracefulExit
   - Verifies processes that take time to cleanup still benefit
   - Saves ~700ms when process exits in 300ms vs 1s kill_delay

These tests ensure the goroutine-based optimization continues to work
correctly and prevent future regressions.

Related: https://github.com/air-verse/air/issues/671

---------

Co-authored-by: xiantang <zhujingdi1998@gmail.com>
This commit is contained in:
Isak Styf
2026-01-05 16:13:36 +01:00
committed by GitHub
parent 07de0e2f11
commit 6f43af790b
3 changed files with 223 additions and 45 deletions
+34 -11
View File
@@ -16,20 +16,43 @@ import (
func (e *Engine) killCmd(cmd *exec.Cmd) (pid int, err error) {
pid = cmd.Process.Pid
if e.config.Build.SendInterrupt {
// Sending a signal to make it clear to the process that it is time to turn off
if err = sendSignalToProcessTree(pid, syscall.SIGINT); err != nil {
return
}
time.Sleep(e.config.killDelay())
// Start a goroutine to wait for the process to exit
done := make(chan struct{})
go func() {
_, _ = cmd.Process.Wait()
close(done)
}()
// If not using send_interrupt, just kill immediately
if !e.config.Build.SendInterrupt {
e.mainDebug("sending SIGKILL to process tree")
err = sendSignalToProcessTree(pid, syscall.SIGKILL)
<-done // Wait for process to exit
return
}
// https://stackoverflow.com/questions/22470193/why-wont-go-kill-a-child-process-correctly
err = sendSignalToProcessTree(pid, syscall.SIGKILL)
// Send SIGINT first to allow graceful shutdown
e.mainDebug("sending interrupt to process tree")
if err = sendSignalToProcessTree(pid, syscall.SIGINT); err != nil {
return
}
// Wait releases any resources associated with the Process.
_, _ = cmd.Process.Wait()
return
killDelay := e.config.killDelay()
e.mainDebug("waiting up to %s for graceful shutdown", killDelay.String())
// Wait for either the process to exit gracefully or the kill delay to expire
select {
case <-done:
// Process exited gracefully after SIGINT - excellent!
e.mainDebug("process exited gracefully after SIGINT")
return
case <-time.After(killDelay):
// Timeout expired, need to force kill
e.mainDebug("kill delay expired, sending SIGKILL")
err = sendSignalToProcessTree(pid, syscall.SIGKILL)
<-done // Wait for process to exit after SIGKILL
return
}
}
func (e *Engine) startCmd(cmd string) (*exec.Cmd, io.ReadCloser, io.ReadCloser, error) {
+151 -23
View File
@@ -1,7 +1,6 @@
package runner
import (
"errors"
"fmt"
"os"
"os/exec"
@@ -10,7 +9,6 @@ import (
"runtime"
"strconv"
"strings"
"syscall"
"testing"
"time"
@@ -201,27 +199,6 @@ func TestAdaptToVariousPlatforms(t *testing.T) {
}
}
func Test_killCmd_no_process(t *testing.T) {
e := Engine{
config: &Config{
Build: cfgBuild{
SendInterrupt: false,
},
},
}
_, err := e.killCmd(&exec.Cmd{
Process: &os.Process{
Pid: 9999,
},
})
if err == nil {
t.Errorf("expected error but got none")
}
if !errors.Is(err, syscall.ESRCH) {
t.Errorf("expected '%s' but got '%s'", syscall.ESRCH, errors.Unwrap(err))
}
}
func Test_killCmd_SendInterrupt_false(t *testing.T) {
_, b, _, _ := runtime.Caller(0)
@@ -611,3 +588,154 @@ func TestFormatPath(t *testing.T) {
runTests(t, tests)
})
}
// Test_killCmd_SendInterrupt_FastGracefulExit is a regression test for issue #671.
// It verifies that when a process exits quickly after receiving SIGINT, Air detects
// this and returns immediately instead of waiting the full kill_delay.
//
// This optimization was implemented in commit 4d26204 (2024-11-01) by Isak Styf.
// Before the fix, Air would always sleep for the full kill_delay (wasting time).
// After the fix, Air uses goroutines to detect process exit and returns early.
//
// Related: https://github.com/air-verse/air/issues/671
func Test_killCmd_SendInterrupt_FastGracefulExit(t *testing.T) {
if runtime.GOOS == "windows" {
t.Skip("send_interrupt not supported on windows")
}
e := Engine{
config: &Config{
Build: cfgBuild{
SendInterrupt: true,
KillDelay: 2 * time.Second, // Set high to make the waste observable
},
},
}
// Process that exits immediately on SIGINT
// trap "exit 0" INT means: exit cleanly when receiving SIGINT
// sleep 100 means: if no signal, run for 100 seconds
cmd, _, _, err := e.startCmd(`sh -c 'trap "exit 0" INT; sleep 100'`)
require.NoError(t, err, "failed to start command")
// Give the process a moment to start and set up the signal handler
time.Sleep(100 * time.Millisecond)
start := time.Now()
pid, err := e.killCmd(cmd)
elapsed := time.Since(start)
require.NoError(t, err, "killCmd should succeed")
assert.Positive(t, pid, "should return valid pid")
// Core assertion: should complete in much less than 2 seconds
// Process exits immediately on SIGINT, so should finish in <500ms
// With the fix (commit 4d26204), this should PASS
// Without the fix, this would FAIL (would take 2+ seconds)
assert.Less(t, elapsed, 500*time.Millisecond,
"killCmd should return quickly when process exits gracefully on SIGINT, "+
"but took %v (expected < 500ms). Regression of issue #671!",
elapsed)
t.Logf("✅ PASS: Process exited gracefully in %v (kill_delay was 2s, saved ~%.1fs)",
elapsed, 2.0-elapsed.Seconds())
}
// Test_killCmd_SendInterrupt_IgnoresSIGINT is a regression test for issue #671.
// It verifies that processes which ignore SIGINT are still killed with SIGKILL
// after kill_delay. This ensures the optimization (commit 4d26204) doesn't break
// the fallback behavior for misbehaving processes.
//
// Related: https://github.com/air-verse/air/issues/671
func Test_killCmd_SendInterrupt_IgnoresSIGINT(t *testing.T) {
if runtime.GOOS == "windows" {
t.Skip("send_interrupt not supported on windows")
}
e := Engine{
config: &Config{
Build: cfgBuild{
SendInterrupt: true,
KillDelay: 500 * time.Millisecond,
},
},
}
// Process that ignores SIGINT
// trap "" INT means: ignore SIGINT signal
cmd, _, _, err := e.startCmd(`sh -c 'trap "" INT; sleep 100'`)
require.NoError(t, err, "failed to start command")
// Give the process a moment to start and set up the signal handler
time.Sleep(100 * time.Millisecond)
start := time.Now()
pid, err := e.killCmd(cmd)
elapsed := time.Since(start)
require.NoError(t, err, "killCmd should succeed")
assert.Positive(t, pid, "should return valid pid")
// Should wait at least kill_delay before sending SIGKILL
assert.GreaterOrEqual(t, elapsed, 500*time.Millisecond,
"killCmd should wait at least kill_delay when process ignores SIGINT")
// But should not wait too long after SIGKILL
assert.Less(t, elapsed, 1*time.Second,
"killCmd should not wait too long after sending SIGKILL, "+
"but took %v", elapsed)
t.Logf("✅ PASS: Process ignored SIGINT, killed with SIGKILL after %v (expected behavior)", elapsed)
}
// Test_killCmd_SendInterrupt_SlowGracefulExit is a regression test for issue #671.
// It verifies that when a process takes some time to clean up after receiving
// SIGINT but still exits within kill_delay, Air returns as soon as the process exits
// (not waiting the full kill_delay).
//
// This optimization was implemented in commit 4d26204 (2024-11-01).
// Related: https://github.com/air-verse/air/issues/671
func Test_killCmd_SendInterrupt_SlowGracefulExit(t *testing.T) {
if runtime.GOOS == "windows" {
t.Skip("send_interrupt not supported on windows")
}
e := Engine{
config: &Config{
Build: cfgBuild{
SendInterrupt: true,
KillDelay: 1 * time.Second,
},
},
}
// Process that takes 300ms to exit after SIGINT (simulating cleanup)
// trap "sleep 0.3; exit 0" INT means: when SIGINT received, sleep 300ms then exit
cmd, _, _, err := e.startCmd(`sh -c 'trap "sleep 0.3; exit 0" INT; sleep 100'`)
require.NoError(t, err, "failed to start command")
// Give the process a moment to start and set up the signal handler
time.Sleep(100 * time.Millisecond)
start := time.Now()
pid, err := e.killCmd(cmd)
elapsed := time.Since(start)
require.NoError(t, err, "killCmd should succeed")
assert.Positive(t, pid, "should return valid pid")
// Should wait at least for the process cleanup time (~300ms)
assert.Greater(t, elapsed, 250*time.Millisecond,
"should wait at least for process cleanup time (~300ms)")
// Should return shortly after process exits (~300-500ms)
// With the fix (commit 4d26204), this should PASS
// Without the fix, this would FAIL (would take 1+ seconds)
assert.Less(t, elapsed, 600*time.Millisecond,
"killCmd should return soon after process exits, "+
"but took %v (expected ~300-500ms). Regression of issue #671!",
elapsed)
t.Logf("✅ PASS: Process exited gracefully in %v after cleanup (kill_delay was 1s, saved ~%.1fs)",
elapsed, 1.0-elapsed.Seconds())
}
+38 -11
View File
@@ -13,18 +13,45 @@ import (
func (e *Engine) killCmd(cmd *exec.Cmd) (pid int, err error) {
pid = cmd.Process.Pid
if e.config.Build.SendInterrupt {
// Sending a signal to make it clear to the process that it is time to turn off
if err = syscall.Kill(-pid, syscall.SIGINT); err != nil {
return
}
time.Sleep(e.config.killDelay())
// Start a goroutine to wait for the process to exit
done := make(chan struct{})
go func() {
_, _ = cmd.Process.Wait()
close(done)
}()
// If not using send_interrupt, just kill immediately
if !e.config.Build.SendInterrupt {
e.mainDebug("sending SIGKILL to process %d", pid)
// https://stackoverflow.com/questions/22470193/why-wont-go-kill-a-child-process-correctly
err = syscall.Kill(-pid, syscall.SIGKILL)
<-done // Wait for process to exit
return
}
// Send SIGINT first to allow graceful shutdown
e.mainDebug("sending interrupt to process %d", pid)
if err = syscall.Kill(-pid, syscall.SIGINT); err != nil {
return
}
killDelay := e.config.killDelay()
e.mainDebug("waiting up to %s for graceful shutdown", killDelay.String())
// Wait for either the process to exit gracefully or the kill delay to expire
select {
case <-done:
// Process exited gracefully after SIGINT - excellent!
e.mainDebug("process exited gracefully after SIGINT")
return
case <-time.After(killDelay):
// Timeout expired, need to force kill
e.mainDebug("kill delay expired, sending SIGKILL")
// https://stackoverflow.com/questions/22470193/why-wont-go-kill-a-child-process-correctly
err = syscall.Kill(-pid, syscall.SIGKILL)
<-done // Wait for process to exit after SIGKILL
return
}
// https://stackoverflow.com/questions/22470193/why-wont-go-kill-a-child-process-correctly
err = syscall.Kill(-pid, syscall.SIGKILL)
// Wait releases any resources associated with the Process.
_, _ = cmd.Process.Wait()
return pid, err
}
func (e *Engine) startCmd(cmd string) (*exec.Cmd, io.ReadCloser, io.ReadCloser, error) {