-
Bug
-
Resolution: Unresolved
-
Major
-
None
-
ec2 plugin version 2.0.3
When there is a large number of idle nodes (500+) and the nodes have an idle termination timeout set, we end up with lock contention on the Queue lock, because each thread spends ~100 ms calling ec2.describeInstances while holding it. At ~100 ms per node, we can process at most 600 nodes per 1-minute check interval, assuming AWS doesn't throttle.
The EC2 Fleet plugin had the same issue and resolved it back in 2019 by making the node cleanup process asynchronous (see https://github.com/jenkinsci/ec2-fleet-plugin/issues/55). I'm not sure whether others have seen it here.
The easier fix here is probably to make the check interval, currently hard-coded at 1 minute, configurable.
In the short term, we are thinking of removing the idle termination timeout and writing our own node cleanup logic, which seems backwards.
Stack trace
"Computer.threadPoolForRemoting 305466" Id=21033600 Group=main RUNNABLE at java.base@17.0.8.1/sun.nio.ch.Net.poll(Native Method) at java.base@17.0.8.1/sun.nio.ch.NioSocketImpl.park(NioSocketImpl.java:186) at java.base@17.0.8.1/sun.nio.ch.NioSocketImpl.timedRead(NioSocketImpl.java:290) at java.base@17.0.8.1/sun.nio.ch.NioSocketImpl.implRead(NioSocketImpl.java:314) at java.base@17.0.8.1/sun.nio.ch.NioSocketImpl.read(NioSocketImpl.java:355) at java.base@17.0.8.1/sun.nio.ch.NioSocketImpl$1.read(NioSocketImpl.java:808) at java.base@17.0.8.1/java.net.Socket$SocketInputStream.read(Socket.java:966) at java.base@17.0.8.1/sun.security.ssl.SSLSocketInputRecord.read(SSLSocketInputRecord.java:484) at java.base@17.0.8.1/sun.security.ssl.SSLSocketInputRecord.readHeader(SSLSocketInputRecord.java:478) at java.base@17.0.8.1/sun.security.ssl.SSLSocketInputRecord.bytesInCompletePacket(SSLSocketInputRecord.java:70) at java.base@17.0.8.1/sun.security.ssl.SSLSocketImpl.readApplicationRecord(SSLSocketImpl.java:1465) at java.base@17.0.8.1/sun.security.ssl.SSLSocketImpl$AppInputStream.read(SSLSocketImpl.java:1069) at org.apache.http.impl.io.SessionInputBufferImpl.streamRead(SessionInputBufferImpl.java:137) at org.apache.http.impl.io.SessionInputBufferImpl.fillBuffer(SessionInputBufferImpl.java:153) at org.apache.http.impl.io.SessionInputBufferImpl.readLine(SessionInputBufferImpl.java:280) at org.apache.http.impl.conn.DefaultHttpResponseParser.parseHead(DefaultHttpResponseParser.java:138) at org.apache.http.impl.conn.DefaultHttpResponseParser.parseHead(DefaultHttpResponseParser.java:56) at org.apache.http.impl.io.AbstractMessageParser.parse(AbstractMessageParser.java:259) at org.apache.http.impl.DefaultBHttpClientConnection.receiveResponseHeader(DefaultBHttpClientConnection.java:163) at org.apache.http.impl.conn.CPoolProxy.receiveResponseHeader(CPoolProxy.java:157) at org.apache.http.protocol.HttpRequestExecutor.doReceiveResponse(HttpRequestExecutor.java:273) at 
com.amazonaws.http.protocol.SdkHttpRequestExecutor.doReceiveResponse(SdkHttpRequestExecutor.java:82) at org.apache.http.protocol.HttpRequestExecutor.execute(HttpRequestExecutor.java:125) at org.apache.http.impl.execchain.MainClientExec.execute(MainClientExec.java:272) at org.apache.http.impl.execchain.ProtocolExec.execute(ProtocolExec.java:186) at org.apache.http.impl.client.InternalHttpClient.doExecute(InternalHttpClient.java:185) at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:83) at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:56) at com.amazonaws.http.apache.client.impl.SdkHttpClient.execute(SdkHttpClient.java:72) at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1346) at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1157) at com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:814) at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:781) at com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:755) at com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:715) at com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:697) at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:561) at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:541) at com.amazonaws.services.ec2.AmazonEC2Client.doInvoke(AmazonEC2Client.java:34698) at com.amazonaws.services.ec2.AmazonEC2Client.invoke(AmazonEC2Client.java:34665) at com.amazonaws.services.ec2.AmazonEC2Client.invoke(AmazonEC2Client.java:34654) at com.amazonaws.services.ec2.AmazonEC2Client.executeDescribeInstances(AmazonEC2Client.java:15804) at com.amazonaws.services.ec2.AmazonEC2Client.describeInstances(AmazonEC2Client.java:15772) at 
hudson.plugins.ec2.CloudHelper.getInstance(CloudHelper.java:54) at hudson.plugins.ec2.CloudHelper.getInstanceWithRetry(CloudHelper.java:32) at hudson.plugins.ec2.EC2Computer.getState(EC2Computer.java:187) at hudson.plugins.ec2.EC2RetentionStrategy.internalCheck(EC2RetentionStrategy.java:148) at hudson.plugins.ec2.EC2RetentionStrategy.check(EC2RetentionStrategy.java:108) at hudson.plugins.ec2.EC2RetentionStrategy.check(EC2RetentionStrategy.java:53) at hudson.slaves.SlaveComputer$3.run(SlaveComputer.java:960) at hudson.model.Queue._withLock(Queue.java:1397) at hudson.model.Queue.withLock(Queue.java:1271) at hudson.slaves.SlaveComputer.setNode(SlaveComputer.java:957) at hudson.model.AbstractCIBase.updateComputer(AbstractCIBase.java:147) at hudson.model.AbstractCIBase$1.run(AbstractCIBase.java:255) at hudson.model.Queue._withLock(Queue.java:1397) at hudson.model.Queue.withLock(Queue.java:1271) at hudson.model.AbstractCIBase.updateComputerList(AbstractCIBase.java:238) at jenkins.model.Jenkins.updateComputerList(Jenkins.java:1693) at jenkins.model.Nodes$5.run(Nodes.java:279) at hudson.model.Queue._withLock(Queue.java:1397) at hudson.model.Queue.withLock(Queue.java:1271) at jenkins.model.Nodes.removeNode(Nodes.java:270) at jenkins.model.Jenkins.removeNode(Jenkins.java:2238) at hudson.plugins.ec2.EC2OndemandSlave.lambda$terminate$0(EC2OndemandSlave.java:107) at hudson.plugins.ec2.EC2OndemandSlave$$Lambda$1306/0x0000000081e0c600.run(Unknown Source) at jenkins.util.ContextResettingExecutorService$1.run(ContextResettingExecutorService.java:28) at jenkins.security.ImpersonatingExecutorService$1.run(ImpersonatingExecutorService.java:68) at jenkins.util.ErrorLoggingExecutorService.lambda$wrap$0(ErrorLoggingExecutorService.java:51) at jenkins.util.ErrorLoggingExecutorService$$Lambda$831/0x0000000081144cf8.run(Unknown Source) at java.base@17.0.8.1/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:539) at 
java.base@17.0.8.1/java.util.concurrent.FutureTask.run(FutureTask.java:264) at java.base@17.0.8.1/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136) at java.base@17.0.8.1/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635) at java.base@17.0.8.1/java.lang.Thread.run(Thread.java:833) Number of locked synchronizers = 5 - java.util.concurrent.locks.ReentrantLock$NonfairSync@30db7a34 - java.util.concurrent.locks.ReentrantLock$NonfairSync@31b189f - java.util.concurrent.ThreadPoolExecutor$Worker@634827be - java.util.concurrent.locks.ReentrantLock$NonfairSync@19074479 - java.util.concurrent.locks.ReentrantLock$NonfairSync@36c81efe