Oct 18, 2018 11:53:49 AM jenkins.metrics.api.Metrics$HealthChecker executeOct 18, 2018 11:53:49 AM jenkins.metrics.api.Metrics$HealthChecker executeWARNING: Some health checks are reporting as unhealthy: [thread-deadlock : [jenkins.util.Timer [#7] locked on hudson.plugins.ec2.AmazonEC2Cloud@5a66924 (owned by jenkins.util.Timer [#10]): at hudson.plugins.ec2.EC2Cloud.connect(EC2Cloud.java:687) at hudson.plugins.ec2.CloudHelper.getInstance(CloudHelper.java:47) at hudson.plugins.ec2.CloudHelper.getInstanceWithRetry(CloudHelper.java:25) at hudson.plugins.ec2.EC2Computer.getState(EC2Computer.java:127) at hudson.plugins.ec2.EC2RetentionStrategy.internalCheck(EC2RetentionStrategy.java:112) at hudson.plugins.ec2.EC2RetentionStrategy.check(EC2RetentionStrategy.java:90) at hudson.plugins.ec2.EC2RetentionStrategy.check(EC2RetentionStrategy.java:48) at hudson.slaves.ComputerRetentionWork$1.run(ComputerRetentionWork.java:72) at hudson.model.Queue._withLock(Queue.java:1380) at hudson.model.Queue.withLock(Queue.java:1257) at hudson.slaves.ComputerRetentionWork.doRun(ComputerRetentionWork.java:63) at hudson.triggers.SafeTimerTask.run(SafeTimerTask.java:72) at jenkins.security.ImpersonatingScheduledExecutorService$1.run(ImpersonatingScheduledExecutorService.java:58) at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.runAndReset(FutureTask.java:308) at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$301(ScheduledThreadPoolExecutor.java:180) at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:294) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748), Computer.threadPoolForRemoting [#34] locked on hudson.plugins.ec2.AmazonEC2Cloud@5a66924 (owned by jenkins.util.Timer [#10]): at hudson.plugins.ec2.EC2Cloud.connect(EC2Cloud.java:687) at hudson.plugins.ec2.CloudHelper.getInstance(CloudHelper.java:47) at hudson.plugins.ec2.CloudHelper.getInstanceWithRetry(CloudHelper.java:25) at hudson.plugins.ec2.EC2Computer.updateInstanceDescription(EC2Computer.java:117) at hudson.plugins.ec2.ssh.EC2UnixLauncher.getEC2HostAddress(EC2UnixLauncher.java:365) at hudson.plugins.ec2.ssh.EC2UnixLauncher.connectToSsh(EC2UnixLauncher.java:319) at hudson.plugins.ec2.ssh.EC2UnixLauncher.bootstrap(EC2UnixLauncher.java:283) at hudson.plugins.ec2.ssh.EC2UnixLauncher.launchScript(EC2UnixLauncher.java:130) at hudson.plugins.ec2.EC2ComputerLauncher.launch(EC2ComputerLauncher.java:48) at hudson.slaves.SlaveComputer$1.call(SlaveComputer.java:294) at jenkins.util.ContextResettingExecutorService$2.call(ContextResettingExecutorService.java:46) at jenkins.security.ImpersonatingExecutorService$2.call(ImpersonatingExecutorService.java:71) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748), jenkins.util.Timer [#10] locked on java.util.concurrent.locks.ReentrantLock$NonfairSync@779d1b9d (owned by jenkins.util.Timer [#7]): at sun.misc.Unsafe.park(Native Method) at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175) at java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:836) at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireQueued(AbstractQueuedSynchronizer.java:870) at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquire(AbstractQueuedSynchronizer.java:1199) at java.util.concurrent.locks.ReentrantLock$NonfairSync.lock(ReentrantLock.java:209) at java.util.concurrent.locks.ReentrantLock.lock(ReentrantLock.java:285) at hudson.model.Queue._withLock(Queue.java:1437) at hudson.model.Queue.withLock(Queue.java:1300) at jenkins.model.Nodes.updateNode(Nodes.java:193) at jenkins.model.Jenkins.updateNode(Jenkins.java:2080) at hudson.model.Node.save(Node.java:140) at hudson.util.PersistedList.onModified(PersistedList.java:173) at hudson.util.PersistedList.replaceBy(PersistedList.java:85) at hudson.model.Slave.<init>(Slave.java:198) at hudson.plugins.ec2.EC2AbstractSlave.<init>(EC2AbstractSlave.java:136) at hudson.plugins.ec2.EC2OndemandSlave.<init>(EC2OndemandSlave.java:49) at hudson.plugins.ec2.EC2OndemandSlave.<init>(EC2OndemandSlave.java:42) at hudson.plugins.ec2.SlaveTemplate.newOndemandSlave(SlaveTemplate.java:954) at hudson.plugins.ec2.SlaveTemplate.toSlaves(SlaveTemplate.java:659) at hudson.plugins.ec2.SlaveTemplate.provisionOndemand(SlaveTemplate.java:631) at hudson.plugins.ec2.SlaveTemplate.provision(SlaveTemplate.java:462) at hudson.plugins.ec2.EC2Cloud.getNewOrExistingAvailableSlave(EC2Cloud.java:548) at hudson.plugins.ec2.EC2Cloud.provision(EC2Cloud.java:563) at hudson.slaves.NodeProvisioner$StandardStrategyImpl.apply(NodeProvisioner.java:715) at hudson.slaves.NodeProvisioner.update(NodeProvisioner.java:320) at hudson.slaves.NodeProvisioner.access$000(NodeProvisioner.java:61) at hudson.slaves.NodeProvisioner$NodeProvisionerInvoker.doRun(NodeProvisioner.java:809) at hudson.triggers.SafeTimerTask.run(SafeTimerTask.java:72) at jenkins.security.ImpersonatingScheduledExecutorService$1.run(ImpersonatingScheduledExecutorService.java:58) at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.runAndReset(FutureTask.java:308) at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$301(ScheduledThreadPoolExecutor.java:180) at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:294) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748)]]
I am testing the attached snapshot now and it just waits for the slaves to be available even though the servers are all up and running.
There is a lot of over provisioning (same as PR252) but it wasn't the case with the releases 1.40.1, even though that version had other bugs.
I will continue to test.