Uploaded image for project: 'Jenkins'
  1. Jenkins
  2. JENKINS-55597

Jenkins JNLP agent disconnects when it fails to delete an old build workspace

XMLWordPrintable

      The agent stays up, but the status in the dashboard keeps changing from online to offline and back. The server-side log at https://X/computer/Y/log shows these messages in a loop:

      JNLP agent connected from Y/Z
      Remoting version: 3.28
      This is a Unix agent
      
      deleting obsolete workspace /home/infinispan/workspace/Openshift_Services_PR-141
      
      java.nio.file.FileSystemException: /home/infinispan/workspace/Openshift_Services_PR-141/openshift.local.clusterup/openshift.local.volumes/pods: Operation not permitted
      	at sun.nio.fs.UnixException.translateToIOException(UnixException.java:91)
      	at sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:102)
      	at sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:107)
      	at sun.nio.fs.UnixFileAttributeViews$Posix.setMode(UnixFileAttributeViews.java:238)
      	at sun.nio.fs.UnixFileAttributeViews$Posix.setPermissions(UnixFileAttributeViews.java:260)
      	at java.nio.file.Files.setPosixFilePermissions(Files.java:2045)
      	at hudson.Util.makeWritable(Util.java:337)
      	at hudson.Util.tryOnceDeleteFile(Util.java:297)
      	at hudson.Util.deleteFile(Util.java:272)
      Also:   hudson.remoting.Channel$CallSiteStackTrace: Remote call to JNLP4-connect connection from nat-pool-rdu-t.redhat.com/66.187.233.202:45016
      		at hudson.remoting.Channel.attachCallSiteStackTrace(Channel.java:1743)
      		at hudson.remoting.UserRequest$ExceptionResponse.retrieve(UserRequest.java:357)
      		at hudson.remoting.Channel.call(Channel.java:957)
      		at hudson.FilePath.act(FilePath.java:1072)
      		at hudson.FilePath.act(FilePath.java:1061)
      		at hudson.FilePath.deleteRecursive(FilePath.java:1268)
      		at jenkins.branch.WorkspaceLocatorImpl$Collector.onOnline(WorkspaceLocatorImpl.java:534)
      		at hudson.slaves.SlaveComputer.setChannel(SlaveComputer.java:692)
      		at jenkins.slaves.DefaultJnlpSlaveReceiver.afterChannel(DefaultJnlpSlaveReceiver.java:168)
      		at org.jenkinsci.remoting.engine.JnlpConnectionState$4.invoke(JnlpConnectionState.java:421)
      		at org.jenkinsci.remoting.engine.JnlpConnectionState.fire(JnlpConnectionState.java:312)
      		at org.jenkinsci.remoting.engine.JnlpConnectionState.fireAfterChannel(JnlpConnectionState.java:418)
      		at org.jenkinsci.remoting.engine.JnlpProtocol4Handler$Handler$1.run(JnlpProtocol4Handler.java:334)
      		at jenkins.util.ContextResettingExecutorService$1.run(ContextResettingExecutorService.java:28)
      		at jenkins.security.ImpersonatingExecutorService$1.run(ImpersonatingExecutorService.java:59)
      		at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
      		at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
      Caused: java.io.IOException: Unable to delete '/home/infinispan/workspace/Openshift_Services_PR-141/openshift.local.clusterup/openshift.local.volumes/pods'. Tried 3 times (of a maximum of 3) waiting 0.1 sec between attempts.
      	at hudson.Util.deleteFile(Util.java:277)
      	at hudson.FilePath.deleteRecursive(FilePath.java:1305)
      	at hudson.FilePath.deleteContentsRecursive(FilePath.java:1314)
      	at hudson.FilePath.deleteRecursive(FilePath.java:1296)
      	at hudson.FilePath.deleteContentsRecursive(FilePath.java:1314)
      	at hudson.FilePath.deleteRecursive(FilePath.java:1296)
      	at hudson.FilePath.deleteContentsRecursive(FilePath.java:1314)
      	at hudson.FilePath.deleteRecursive(FilePath.java:1296)
      	at hudson.FilePath.access$1600(FilePath.java:213)
      	at hudson.FilePath$DeleteRecursive.invoke(FilePath.java:1274)
      	at hudson.FilePath$DeleteRecursive.invoke(FilePath.java:1270)
      	at hudson.FilePath$FileCallableWrapper.call(FilePath.java:3086)
      	at hudson.remoting.UserRequest.perform(UserRequest.java:212)
      	at hudson.remoting.UserRequest.perform(UserRequest.java:54)
      	at hudson.remoting.Request$2.run(Request.java:369)
      	at hudson.remoting.InterceptingExecutorService$1.call(InterceptingExecutorService.java:72)
      	at java.util.concurrent.FutureTask.run(FutureTask.java:266)
      	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
      	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
      	at hudson.remoting.Engine$1.lambda$newThread$0(Engine.java:93)
      	at java.lang.Thread.run(Thread.java:748)
      ERROR: Connection terminated
      java.nio.channels.ClosedChannelException
      	at org.jenkinsci.remoting.protocol.impl.ChannelApplicationLayer.onReadClosed(ChannelApplicationLayer.java:209)
      	at org.jenkinsci.remoting.protocol.ApplicationLayer.onRecvClosed(ApplicationLayer.java:222)
      	at org.jenkinsci.remoting.protocol.ProtocolStack$Ptr.onRecvClosed(ProtocolStack.java:816)
      	at org.jenkinsci.remoting.protocol.FilterLayer.onRecvClosed(FilterLayer.java:287)
      	at org.jenkinsci.remoting.protocol.impl.SSLEngineFilterLayer.onRecvClosed(SSLEngineFilterLayer.java:181)
      	at org.jenkinsci.remoting.protocol.impl.SSLEngineFilterLayer.switchToNoSecure(SSLEngineFilterLayer.java:283)
      	at org.jenkinsci.remoting.protocol.impl.SSLEngineFilterLayer.processWrite(SSLEngineFilterLayer.java:503)
      	at org.jenkinsci.remoting.protocol.impl.SSLEngineFilterLayer.processQueuedWrites(SSLEngineFilterLayer.java:248)
      	at org.jenkinsci.remoting.protocol.impl.SSLEngineFilterLayer.doSend(SSLEngineFilterLayer.java:200)
      	at org.jenkinsci.remoting.protocol.impl.SSLEngineFilterLayer.doCloseSend(SSLEngineFilterLayer.java:213)
      	at org.jenkinsci.remoting.protocol.ProtocolStack$Ptr.doCloseSend(ProtocolStack.java:784)
      	at org.jenkinsci.remoting.protocol.ApplicationLayer.doCloseWrite(ApplicationLayer.java:173)
      	at org.jenkinsci.remoting.protocol.impl.ChannelApplicationLayer$ByteBufferCommandTransport.closeWrite(ChannelApplicationLayer.java:314)
      	at hudson.remoting.Channel.close(Channel.java:1452)
      	at hudson.remoting.Channel.close(Channel.java:1405)
      	at jenkins.slaves.DefaultJnlpSlaveReceiver.afterChannel(DefaultJnlpSlaveReceiver.java:173)
      	at org.jenkinsci.remoting.engine.JnlpConnectionState$4.invoke(JnlpConnectionState.java:421)
      	at org.jenkinsci.remoting.engine.JnlpConnectionState.fire(JnlpConnectionState.java:312)
      	at org.jenkinsci.remoting.engine.JnlpConnectionState.fireAfterChannel(JnlpConnectionState.java:418)
      	at org.jenkinsci.remoting.engine.JnlpProtocol4Handler$Handler$1.run(JnlpProtocol4Handler.java:334)
      	at jenkins.util.ContextResettingExecutorService$1.run(ContextResettingExecutorService.java:28)
      	at jenkins.security.ImpersonatingExecutorService$1.run(ImpersonatingExecutorService.java:59)
      	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
      	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
      	at java.lang.Thread.run(Thread.java:748)
      

      The agent log doesn't include any of this, even after I configured j.u.l with `.level = ALL`.

      IMO failing to delete a build directory is not a fatal error, it should only affect builds that try to use exactly the same directory.

       

      Additionally, while the node is online, it may start to run a build, but then it immediately goes offline and the build fails with this error:

      Error when executing always post condition:
      org.jenkinsci.plugins.workflow.steps.MissingContextVariableException: Required context class hudson.FilePath is missing
      Perhaps you forgot to surround the code with a step that provides this, such as: node
      	at org.jenkinsci.plugins.workflow.steps.StepDescriptor.checkContextAvailability(StepDescriptor.java:261)
      	at org.jenkinsci.plugins.workflow.cps.DSL.invokeStep(DSL.java:260)
      	at org.jenkinsci.plugins.workflow.cps.DSL.invokeMethod(DSL.java:176)
      	at org.jenkinsci.plugins.workflow.cps.CpsScript.invokeMethod(CpsScript.java:122)
      	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
      	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
      	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
      	at java.lang.reflect.Method.invoke(Method.java:498)
      	at org.codehaus.groovy.reflection.CachedMethod.invoke(CachedMethod.java:93)
      	at groovy.lang.MetaMethod.doMethodInvoke(MetaMethod.java:325)
      	at groovy.lang.MetaClassImpl.invokeMethod(MetaClassImpl.java:1213)
      	at groovy.lang.MetaClassImpl.invokeMethod(MetaClassImpl.java:1022)
      	at org.codehaus.groovy.runtime.callsite.PogoMetaClassSite.call(PogoMetaClassSite.java:42)
      	at org.codehaus.groovy.runtime.callsite.CallSiteArray.defaultCall(CallSiteArray.java:48)
      	at org.codehaus.groovy.runtime.callsite.AbstractCallSite.call(AbstractCallSite.java:113)
      	at org.kohsuke.groovy.sandbox.impl.Checker$1.call(Checker.java:157)
      	at org.kohsuke.groovy.sandbox.GroovyInterceptor.onMethodCall(GroovyInterceptor.java:23)
      	at org.jenkinsci.plugins.scriptsecurity.sandbox.groovy.SandboxInterceptor.onMethodCall(SandboxInterceptor.java:155)
      	at org.kohsuke.groovy.sandbox.impl.Checker$1.call(Checker.java:155)
      	at org.kohsuke.groovy.sandbox.impl.Checker.checkedCall(Checker.java:159)
      	at org.kohsuke.groovy.sandbox.impl.Checker.checkedCall(Checker.java:129)
      	at com.cloudbees.groovy.cps.sandbox.SandboxInvoker.methodCall(SandboxInvoker.java:17)
      	at WorkflowScript.run(WorkflowScript:79)
      	at org.jenkinsci.plugins.pipeline.modeldefinition.ModelInterpreter.delegateAndExecute(jar:file:/var/lib/jenkins/plugins/pipeline-model-definition/WEB-INF/lib/pipeline-model-definition.jar!/org/jenkinsci/plugins/pipeline/modeldefinition/ModelInterpreter.groovy:133)
      	at org.jenkinsci.plugins.pipeline.modeldefinition.ModelInterpreter.runPostConditions(jar:file:/var/lib/jenkins/plugins/pipeline-model-definition/WEB-INF/lib/pipeline-model-definition.jar!/org/jenkinsci/plugins/pipeline/modeldefinition/ModelInterpreter.groovy:789)
      	at org.jenkinsci.plugins.pipeline.modeldefinition.ModelInterpreter.catchRequiredContextForNode(jar:file:/var/lib/jenkins/plugins/pipeline-model-definition/WEB-INF/lib/pipeline-model-definition.jar!/org/jenkinsci/plugins/pipeline/modeldefinition/ModelInterpreter.groovy:418)
      	at org.jenkinsci.plugins.pipeline.modeldefinition.ModelInterpreter.catchRequiredContextForNode(jar:file:/var/lib/jenkins/plugins/pipeline-model-definition/WEB-INF/lib/pipeline-model-definition.jar!/org/jenkinsci/plugins/pipeline/modeldefinition/ModelInterpreter.groovy:416)
      	at org.jenkinsci.plugins.pipeline.modeldefinition.ModelInterpreter.runPostConditions(jar:file:/var/lib/jenkins/plugins/pipeline-model-definition/WEB-INF/lib/pipeline-model-definition.jar!/org/jenkinsci/plugins/pipeline/modeldefinition/ModelInterpreter.groovy:788)
      	at com.cloudbees.groovy.cps.CpsDefaultGroovyMethods.each(CpsDefaultGroovyMethods:2030)
      	at com.cloudbees.groovy.cps.CpsDefaultGroovyMethods.each(CpsDefaultGroovyMethods:2015)
      	at com.cloudbees.groovy.cps.CpsDefaultGroovyMethods.each(CpsDefaultGroovyMethods:2056)
      	at org.jenkinsci.plugins.pipeline.modeldefinition.ModelInterpreter.runPostConditions(jar:file:/var/lib/jenkins/plugins/pipeline-model-definition/WEB-INF/lib/pipeline-model-definition.jar!/org/jenkinsci/plugins/pipeline/modeldefinition/ModelInterpreter.groovy:783)
      	at org.jenkinsci.plugins.pipeline.modeldefinition.ModelInterpreter.executePostBuild(jar:file:/var/lib/jenkins/plugins/pipeline-model-definition/WEB-INF/lib/pipeline-model-definition.jar!/org/jenkinsci/plugins/pipeline/modeldefinition/ModelInterpreter.groovy:760)
      	at ___cps.transform___(Native Method)
      	at com.cloudbees.groovy.cps.impl.ContinuationGroup.methodCall(ContinuationGroup.java:57)
      	at com.cloudbees.groovy.cps.impl.FunctionCallBlock$ContinuationImpl.dispatchOrArg(FunctionCallBlock.java:109)
      	at com.cloudbees.groovy.cps.impl.FunctionCallBlock$ContinuationImpl.fixArg(FunctionCallBlock.java:82)
      	at sun.reflect.GeneratedMethodAccessor327.invoke(Unknown Source)
      	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
      	at java.lang.reflect.Method.invoke(Method.java:498)
      	at com.cloudbees.groovy.cps.impl.ContinuationPtr$ContinuationImpl.receive(ContinuationPtr.java:72)
      	at com.cloudbees.groovy.cps.impl.ConstantBlock.eval(ConstantBlock.java:21)
      	at com.cloudbees.groovy.cps.Next.step(Next.java:83)
      	at com.cloudbees.groovy.cps.Continuable$1.call(Continuable.java:174)
      	at com.cloudbees.groovy.cps.Continuable$1.call(Continuable.java:163)
      	at org.codehaus.groovy.runtime.GroovyCategorySupport$ThreadCategoryInfo.use(GroovyCategorySupport.java:129)
      	at org.codehaus.groovy.runtime.GroovyCategorySupport.use(GroovyCategorySupport.java:268)
      	at com.cloudbees.groovy.cps.Continuable.run0(Continuable.java:163)
      	at org.jenkinsci.plugins.workflow.cps.SandboxContinuable.access$101(SandboxContinuable.java:34)
      	at org.jenkinsci.plugins.workflow.cps.SandboxContinuable.lambda$run0$0(SandboxContinuable.java:59)
      	at org.jenkinsci.plugins.scriptsecurity.sandbox.groovy.GroovySandbox.runInSandbox(GroovySandbox.java:121)
      	at org.jenkinsci.plugins.workflow.cps.SandboxContinuable.run0(SandboxContinuable.java:58)
      	at org.jenkinsci.plugins.workflow.cps.CpsThread.runNextChunk(CpsThread.java:182)
      	at org.jenkinsci.plugins.workflow.cps.CpsThreadGroup.run(CpsThreadGroup.java:332)
      	at org.jenkinsci.plugins.workflow.cps.CpsThreadGroup.access$200(CpsThreadGroup.java:83)
      	at org.jenkinsci.plugins.workflow.cps.CpsThreadGroup$2.call(CpsThreadGroup.java:244)
      	at org.jenkinsci.plugins.workflow.cps.CpsThreadGroup$2.call(CpsThreadGroup.java:232)
      	at org.jenkinsci.plugins.workflow.cps.CpsVmExecutorService$2.call(CpsVmExecutorService.java:64)
      

      It would be great if the error message would say explicitly that the build is failing because the node was disconnected, and if the agent-side logs included the error as well.

            Unassigned Unassigned
            danberindei Dan Berindei
            Votes:
            4 Vote for this issue
            Watchers:
            8 Start watching this issue

              Created:
              Updated:
              Resolved: