feat(inputs.slurm): Add a SLURM input plugin (#15700)

Pablo Collado 2024-08-15 16:07:46 +02:00 committed by GitHub
parent 371b9887fb
commit 7b5462692b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
22 changed files with 1902 additions and 22 deletions


@ -307,6 +307,7 @@ following works:
- github.com/opentracing/opentracing-go [Apache License 2.0](https://github.com/opentracing/opentracing-go/blob/master/LICENSE)
- github.com/p4lang/p4runtime [Apache License 2.0](https://github.com/p4lang/p4runtime/blob/main/LICENSE)
- github.com/pborman/ansi [BSD 3-Clause "New" or "Revised" License](https://github.com/pborman/ansi/blob/master/LICENSE)
- github.com/pcolladosoto/goslurm [MIT License](https://github.com/pcolladosoto/goslurm/blob/main/LICENSE)
- github.com/peterbourgon/unixtransport [Apache License 2.0](https://github.com/peterbourgon/unixtransport/blob/main/LICENSE)
- github.com/philhofer/fwd [MIT License](https://github.com/philhofer/fwd/blob/master/LICENSE.md)
- github.com/pierrec/lz4 [BSD 3-Clause "New" or "Revised" License](https://github.com/pierrec/lz4/blob/master/LICENSE)

go.mod

@ -31,7 +31,7 @@ require (
github.com/PaesslerAG/gval v1.2.2
github.com/SAP/go-hdb v1.9.10
github.com/aerospike/aerospike-client-go/v5 v5.11.0
github.com/alecthomas/units v0.0.0-20231202071711-9a357b53e9c9
github.com/alecthomas/units v0.0.0-20240626203959-61d1e3462e30
github.com/alitto/pond v1.9.0
github.com/aliyun/alibaba-cloud-sdk-go v1.62.721
github.com/amir/raidman v0.0.0-20170415203553-1ccc43bfb9c9
@ -159,6 +159,7 @@ require (
github.com/openzipkin/zipkin-go v0.4.3
github.com/p4lang/p4runtime v1.3.0
github.com/pborman/ansi v1.0.0
github.com/pcolladosoto/goslurm v0.1.0
github.com/peterbourgon/unixtransport v0.0.4
github.com/pion/dtls/v2 v2.2.12
github.com/prometheus-community/pro-bing v0.4.1
@ -191,7 +192,7 @@ require (
github.com/testcontainers/testcontainers-go v0.32.0
github.com/testcontainers/testcontainers-go/modules/kafka v0.32.0
github.com/thomasklein94/packer-plugin-libvirt v0.5.0
github.com/tidwall/gjson v1.17.0
github.com/tidwall/gjson v1.17.1
github.com/tidwall/wal v1.1.7
github.com/tinylib/msgp v1.2.0
github.com/urfave/cli/v2 v2.27.2
@ -271,11 +272,11 @@ require (
github.com/abbot/go-http-auth v0.4.0 // indirect
github.com/alecthomas/participle v0.4.1 // indirect
github.com/andybalholm/brotli v1.1.0 // indirect
github.com/antlr4-go/antlr/v4 v4.13.0 // indirect
github.com/antlr4-go/antlr/v4 v4.13.1 // indirect
github.com/apache/arrow/go/v15 v15.0.2 // indirect
github.com/aristanetworks/glog v0.0.0-20191112221043-67e8567f59f3 // indirect
github.com/armon/go-metrics v0.4.1 // indirect
github.com/awnumar/memcall v0.2.0 // indirect
github.com/awnumar/memcall v0.3.0 // indirect
github.com/aws/aws-sdk-go v1.53.16 // indirect
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.3 // indirect
github.com/aws/aws-sdk-go-v2/feature/dynamodb/attributevalue v1.13.7 // indirect
@ -460,11 +461,11 @@ require (
github.com/signalfx/sapm-proto v0.12.0 // indirect
github.com/spf13/cast v1.6.0 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/stoewer/go-strcase v1.2.0 // indirect
github.com/stoewer/go-strcase v1.3.0 // indirect
github.com/stretchr/objx v0.5.2 // indirect
github.com/tidwall/match v1.1.1 // indirect
github.com/tidwall/pretty v1.2.0 // indirect
github.com/tidwall/tinylru v1.1.0 // indirect
github.com/tidwall/pretty v1.2.1 // indirect
github.com/tidwall/tinylru v1.2.1 // indirect
github.com/tklauser/go-sysconf v0.3.13 // indirect
github.com/tklauser/numcpus v0.7.0 // indirect
github.com/twmb/murmur3 v1.1.7 // indirect
@ -496,7 +497,7 @@ require (
go.uber.org/atomic v1.11.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
go.uber.org/zap v1.24.0 // indirect
golang.org/x/exp v0.0.0-20240529005216-23cca8864a10 // indirect
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect
golang.org/x/time v0.5.0 // indirect
golang.org/x/tools v0.23.0 // indirect
golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 // indirect

go.sum

@ -797,8 +797,8 @@ github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuy
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d/go.mod h1:rBZYJk541a8SKzHPHnH3zbiI+7dagKZ0cgpgrD7Fyho=
github.com/alecthomas/units v0.0.0-20231202071711-9a357b53e9c9 h1:ez/4by2iGztzR4L0zgAOR8lTQK9VlyBVVd7G4omaOQs=
github.com/alecthomas/units v0.0.0-20231202071711-9a357b53e9c9/go.mod h1:OMCwj8VM1Kc9e19TLln2VL61YJF0x1XFtfdL4JdbSyE=
github.com/alecthomas/units v0.0.0-20240626203959-61d1e3462e30 h1:t3eaIm0rUkzbrIewtiFmMK5RXHej2XnoXNhxVsAYUfg=
github.com/alecthomas/units v0.0.0-20240626203959-61d1e3462e30/go.mod h1:fvzegU4vN3H1qMT+8wDmzjAcDONcgo2/SZ/TyfdUOFs=
github.com/alexbrainman/sspi v0.0.0-20231016080023-1a75b4708caa h1:LHTHcTQiSGT7VVbI0o4wBRNQIgn917usHWOd6VAffYI=
github.com/alexbrainman/sspi v0.0.0-20231016080023-1a75b4708caa/go.mod h1:cEWa1LVoE5KvSD9ONXsZrj0z6KqySlCCNKHlLzbqAt4=
github.com/alicebob/gopher-json v0.0.0-20200520072559-a9ecdc9d1d3a/go.mod h1:SGnFV6hVsYE877CKEZ6tDNTjaSXYUk6QqoIK6PrAtcc=
@ -823,8 +823,8 @@ github.com/antchfx/xpath v1.2.3/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwq
github.com/antchfx/xpath v1.3.1 h1:PNbFuUqHwWl0xRjvUPjJ95Agbmdj2uzzIwmQKgu4oCk=
github.com/antchfx/xpath v1.3.1/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY=
github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI=
github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g=
github.com/antlr4-go/antlr/v4 v4.13.1 h1:SqQKkuVZ+zWkMMNkjy5FZe5mr5WURWnlpmOuzYWrPrQ=
github.com/antlr4-go/antlr/v4 v4.13.1/go.mod h1:GKmUxMtwp6ZgGwZSva4eWPC5mS6vUAmOABFgjdkM7Nw=
github.com/apache/arrow/go/v10 v10.0.1/go.mod h1:YvhnlEePVnBS4+0z3fhPfUy7W1Ikj0Ih0vcRo/gZ1M0=
github.com/apache/arrow/go/v11 v11.0.0/go.mod h1:Eg5OsL5H+e299f7u5ssuXsuHQVEGC4xei5aX110hRiI=
github.com/apache/arrow/go/v15 v15.0.2 h1:60IliRbiyTWCWjERBCkO1W4Qun9svcYoZrSLcyOsMLE=
@ -853,8 +853,8 @@ github.com/armon/go-radix v0.0.0-20180808171621-7fddfc383310/go.mod h1:ufUuZ+zHj
github.com/armon/go-radix v1.0.0/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgIH9cCH8=
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio=
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs=
github.com/awnumar/memcall v0.2.0 h1:sRaogqExTOOkkNwO9pzJsL8jrOV29UuUW7teRMfbqtI=
github.com/awnumar/memcall v0.2.0/go.mod h1:S911igBPR9CThzd/hYQQmTc9SWNu3ZHIlCGaWsWsoJo=
github.com/awnumar/memcall v0.3.0 h1:8b/3Sptrtgejj2kLgL6M5F2r4OzTf19CTllO+gIXUg8=
github.com/awnumar/memcall v0.3.0/go.mod h1:8xOx1YbfyuCg3Fy6TO8DK0kZUua3V42/goA5Ru47E8w=
github.com/awnumar/memguard v0.22.5 h1:PH7sbUVERS5DdXh3+mLo8FDcl1eIeVjJVYMnyuYpvuI=
github.com/awnumar/memguard v0.22.5/go.mod h1:+APmZGThMBWjnMlKiSM1X7MVpbIVewen2MTkqWkA/zE=
github.com/aws/aws-sdk-go v1.19.48/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo=
@ -2085,6 +2085,8 @@ github.com/pborman/ansi v1.0.0 h1:OqjHMhvlSuCCV5JT07yqPuJPQzQl+WXsiZ14gZsqOrQ=
github.com/pborman/ansi v1.0.0/go.mod h1:SgWzwMAx1X/Ez7i90VqF8LRiQtx52pWDiQP+x3iGnzw=
github.com/pborman/getopt v0.0.0-20190409184431-ee0cd42419d3/go.mod h1:85jBQOZwpVEaDAr341tbn15RS4fCAsIst0qp7i8ex1o=
github.com/pborman/getopt v1.1.0/go.mod h1:FxXoW1Re00sQG/+KIkuSqRL/LwQgSkv7uyac+STFsbk=
github.com/pcolladosoto/goslurm v0.1.0 h1:d2KigvDfsIIeVeHHj/pTtajz2T0cHHqhGk9iJWUdGaM=
github.com/pcolladosoto/goslurm v0.1.0/go.mod h1:eLuBFfN/tj4O/HDMrAJXb+3s3rGhdHQVZFcOUV1Sbbo=
github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic=
github.com/pelletier/go-toml v1.8.1/go.mod h1:T2/BmBdy8dvIRq1a/8aqjN41wvWlN4lrapLU/GW4pbc=
github.com/pelletier/go-toml v1.9.5/go.mod h1:u1nR/EPcESfeI/szUZKdtJ0xRNbUoANCkoOuaOx1Y+c=
@ -2312,8 +2314,8 @@ github.com/srebhan/cborquery v1.0.1 h1:cFG1falVzmlfyVI8tY6hYM7RQqLxFzt9STusdxHoy
github.com/srebhan/cborquery v1.0.1/go.mod h1:GgsaIoCW+qlqyU+cjSeOpaWhbiiMVkA0uU/H3+PWvjQ=
github.com/srebhan/protobufquery v0.0.0-20230803132024-ae4c0d878e55 h1:ksmbrLbJAm+8yxB7fJ245usD0b1v9JHBJrWF+WqGyjs=
github.com/srebhan/protobufquery v0.0.0-20230803132024-ae4c0d878e55/go.mod h1:SIB3zq5pZq2Ff7aJtCdRpGiHc/meKyMLPEj8F5Tf1j8=
github.com/stoewer/go-strcase v1.2.0 h1:Z2iHWqGXH00XYgqDmNgQbIBxf3wrNq0F3feEy0ainaU=
github.com/stoewer/go-strcase v1.2.0/go.mod h1:IBiWB2sKIp3wVVQ3Y035++gc+knqhUQag1KpM8ahLw8=
github.com/stoewer/go-strcase v1.3.0 h1:g0eASXYtp+yvN9fK8sH94oCIk0fau9uV1/ZdJ0AVEzs=
github.com/stoewer/go-strcase v1.3.0/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8wodgtPmh1xo=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE=
@ -2350,14 +2352,16 @@ github.com/testcontainers/testcontainers-go/modules/kafka v0.32.0/go.mod h1:GCPw
github.com/thomasklein94/packer-plugin-libvirt v0.5.0 h1:aj2HLHZZM/ClGLIwVp9rrgh+2TOU/w4EiaZHAwCpOgs=
github.com/thomasklein94/packer-plugin-libvirt v0.5.0/go.mod h1:GwN82FQ6KxCNKtS8LNUgLbwTZs90GGhBzCmTNkrTCrY=
github.com/tidwall/gjson v1.10.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
github.com/tidwall/gjson v1.17.0 h1:/Jocvlh98kcTfpN2+JzGQWQcqrPQwDrVEMApx/M5ZwM=
github.com/tidwall/gjson v1.17.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
github.com/tidwall/gjson v1.17.1 h1:wlYEnwqAHgzmhNUFfw7Xalt2JzQvsMx2Se4PcoFCT/U=
github.com/tidwall/gjson v1.17.1/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA=
github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
github.com/tidwall/pretty v1.2.0 h1:RWIZEg2iJ8/g6fDDYzMpobmaoGh5OLl4AXtGUGPcqCs=
github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
github.com/tidwall/tinylru v1.1.0 h1:XY6IUfzVTU9rpwdhKUF6nQdChgCdGjkMfLzbWyiau6I=
github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4=
github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
github.com/tidwall/tinylru v1.1.0/go.mod h1:3+bX+TJ2baOLMWTnlyNWHh4QMnFyARg2TLTQ6OFbzw8=
github.com/tidwall/tinylru v1.2.1 h1:VgBr72c2IEr+V+pCdkPZUwiQ0KJknnWIYbhxAVkYfQk=
github.com/tidwall/tinylru v1.2.1/go.mod h1:9bQnEduwB6inr2Y7AkBP7JPgCkyrhTV/ZpX0oOOpBI4=
github.com/tidwall/wal v1.1.7 h1:emc1TRjIVsdKKSnpwGBAcsAGg0767SvUk8+ygx7Bb+4=
github.com/tidwall/wal v1.1.7/go.mod h1:r6lR1j27W9EPalgHiB7zLJDYu3mzW5BQP5KrzBpYY/E=
github.com/tinylib/msgp v1.2.0 h1:0uKB/662twsVBpYUPbokj4sTSKhWFKB7LopO2kWK8lY=
@ -2584,8 +2588,8 @@ golang.org/x/exp v0.0.0-20200119233911-0405dc783f0a/go.mod h1:2RIsYlXP63K8oxa1u0
golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EHIKF9dgMWnmCNThgcyBT1FY9mM=
golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU=
golang.org/x/exp v0.0.0-20220827204233-334a2380cb91/go.mod h1:cyybsKvd6eL0RnXn6p/Grxp8F5bW7iYuBgsNCOHpMYE=
golang.org/x/exp v0.0.0-20240529005216-23cca8864a10 h1:vpzMC/iZhYFAjJzHU0Cfuq+w1vLLsF2vLkDrPjzKYck=
golang.org/x/exp v0.0.0-20240529005216-23cca8864a10/go.mod h1:XtvwrStGgqGPLc4cjQfWqZHG1YFdYs6swckp8vpsjnc=
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8=
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY=
golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs=
golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js=
golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=


@ -0,0 +1,5 @@
//go:build !custom || inputs || inputs.slurm

package all

import _ "github.com/influxdata/telegraf/plugins/inputs/slurm" // register plugin


@ -0,0 +1,197 @@
# SLURM Input Plugin
This plugin gathers diag, jobs, nodes, partitions and reservations metrics by
leveraging SLURM's REST API as provided by the `slurmrestd` daemon.
This plugin targets the `openapi/v0.0.38` OpenAPI plugin as defined in SLURM's
documentation. That particular plugin should be enabled when starting the
`slurmrestd` daemon. For more information, be sure to check SLURM's
documentation [here][SLURM Doc].
A great deal of additional information can also be found in the repository of
the Go module implementing the API client, [pcolladosoto/goslurm][]; a minimal,
standalone example of querying `slurmrestd` with that client is shown below.
[SLURM Doc]: https://slurm.schedmd.com/rest.html
[pcolladosoto/goslurm]: https://github.com/pcolladosoto/goslurm
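For illustration only, the following sketch (not part of the plugin) shows how
that client can be pointed at a `slurmrestd` instance to fetch the same
scheduler diagnostics the `diag` endpoint exposes. The host, username and
token are placeholders to be replaced with values matching your deployment.
```go
// Illustrative sketch: query slurmrestd's diag endpoint with the goslurm
// client this plugin builds upon. Host, username and token are placeholders.
package main

import (
	"context"
	"fmt"
	"log"

	goslurm "github.com/pcolladosoto/goslurm/v0038"
)

func main() {
	cfg := goslurm.NewConfiguration()
	cfg.Host = "127.0.0.1:6820"
	cfg.Scheme = "http"
	client := goslurm.NewAPIClient(cfg)

	// JWT-based authentication: the same keys the plugin injects on each request.
	auth := context.WithValue(context.Background(), goslurm.ContextAPIKeys,
		map[string]goslurm.APIKey{
			"user":  {Key: "foo"},
			"token": {Key: "topSecret"},
		},
	)

	diagResp, httpResp, err := client.SlurmAPI.SlurmV0038Diag(auth).Execute()
	if err != nil {
		log.Fatal(err)
	}
	defer httpResp.Body.Close()

	if stats, ok := diagResp.GetStatisticsOk(); ok {
		if running, ok := stats.GetJobsRunningOk(); ok {
			fmt.Println("jobs running:", *running)
		}
	}
}
```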
## Global configuration options <!-- @/docs/includes/plugin_config.md -->
In addition to the plugin-specific configuration settings, plugins support
additional global and plugin configuration settings. These settings are used to
modify metrics, tags, and fields or create aliases and configure ordering, etc.
See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
[CONFIGURATION.md]: ../../../docs/CONFIGURATION.md#plugins
## Configuration
```toml @sample.conf
# Gather SLURM metrics
[[inputs.slurm]]
  ## Slurmrestd URL. Both http and https can be used as schemes.
url = "http://127.0.0.1:6820"
## Credentials for JWT-based authentication.
# username = "foo"
# token = "topSecret"
## Enabled endpoints
## List of endpoints a user can acquire data from.
## Available values are: diag, jobs, nodes, partitions, reservations.
# enabled_endpoints = ["diag", "jobs", "nodes", "partitions", "reservations"]
## Maximum time to receive a response. If set to 0s, the
## request will not time out.
# response_timeout = "5s"
## Optional TLS Config. Note these options will only
  ## be taken into account when the scheme specified on
## the URL parameter is https. They will be silently
## ignored otherwise.
## Set to true/false to enforce TLS being enabled/disabled. If not set,
## enable TLS only if any of the other options are specified.
# tls_enable =
## Trusted root certificates for server
# tls_ca = "/path/to/cafile"
## Used for TLS client certificate authentication
# tls_cert = "/path/to/certfile"
## Used for TLS client certificate authentication
# tls_key = "/path/to/keyfile"
## Password for the key file if it is encrypted
# tls_key_pwd = ""
## Send the specified TLS server name via SNI
# tls_server_name = "kubernetes.example.com"
## Minimal TLS version to accept by the client
# tls_min_version = "TLS12"
## List of ciphers to accept, by default all secure ciphers will be accepted
## See https://pkg.go.dev/crypto/tls#pkg-constants for supported values.
## Use "all", "secure" and "insecure" to add all support ciphers, secure
## suites or insecure suites respectively.
# tls_cipher_suites = ["secure"]
## Renegotiation method, "never", "once" or "freely"
# tls_renegotiation_method = "never"
## Use TLS but skip chain & host verification
# insecure_skip_verify = false
```
## Metrics
Given the large number of metrics offered by SLURM's API, an effort has been
made to strike a balance between verbosity and usefulness in the gathered
information.
- slurm_diag
- tags:
- source
- fields:
- server_thread_count
- jobs_canceled
- jobs_submitted
- jobs_started
- jobs_completed
- jobs_failed
- jobs_pending
- jobs_running
- schedule_cycle_last
- schedule_cycle_mean
- bf_queue_len
- bf_queue_len_mean
- bf_active
- slurm_jobs
- tags:
- source
- name
- job_id
- fields:
- state
- state_reason
- partition
- nodes
- node_count
- priority
- nice
- group_id
- command
- standard_output
- standard_error
- standard_input
- current_working_directory
- submit_time
- start_time
- cpus
- tasks
- time_limit
- tres_cpu
- tres_mem
- tres_node
- tres_billing
- slurm_nodes
- tags:
- source
- name
- fields:
- state
- cores
- cpus
- cpu_load
- alloc_cpu
- real_memory
- free_memory
- alloc_memory
- tres_cpu
- tres_mem
- tres_billing
- tres_used_cpu
- tres_used_mem
- weight
- slurmd_version
- architecture
- slurm_partitions
- tags:
- source
- name
- fields:
- state
- total_cpu
- total_nodes
- nodes
- tres_cpu
- tres_mem
- tres_node
- tres_billing
- slurm_reservations
- tags:
- source
- name
- fields:
- core_count
- core_spec_count
- groups
- users
- start_time
- partition
- accounts
- node_count
- node_list
## Example Output
```text
slurm_diag,host=hoth,source=slurm_primary.example.net bf_active=false,bf_queue_len=1i,bf_queue_len_mean=1i,jobs_canceled=0i,jobs_completed=137i,jobs_failed=0i,jobs_pending=0i,jobs_running=100i,jobs_started=137i,jobs_submitted=137i,schedule_cycle_last=27i,schedule_cycle_mean=86i,server_thread_count=3i 1723466497000000000
slurm_jobs,host=hoth,job_id=23160,name=gridjob,source=slurm_primary.example.net command="/tmp/SLURM_job_script.11BCgQ",cpus=2i,current_working_directory="/home/sessiondir/7CQODmQ3uw5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmG9JKDmILUkln",group_id=2005i,nice=50i,node_count=1i,nodes="naboo225",partition="atlas",priority=4294878569i,standard_error="/home/sessiondir/7CQODmQ3uw5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmG9JKDmILUkln.comment",standard_input="/dev/null",standard_output="/home/sessiondir/7CQODmQ3uw5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmG9JKDmILUkln.comment",start_time=1723354525i,state="RUNNING",state_reason="None",submit_time=1723354525i,tasks=1i,time_limit=3600i,tres_billing=1,tres_cpu=1,tres_mem=2000,tres_node=1 1723466497000000000
slurm_jobs,host=hoth,job_id=23365,name=gridjob,source=slurm_primary.example.net command="/tmp/SLURM_job_script.yRcFYL",cpus=2i,current_working_directory="/home/sessiondir/LgwNDmTLAx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDm2BKKDm8bFZsm",group_id=2005i,nice=50i,node_count=1i,nodes="naboo224",partition="atlas",priority=4294878364i,standard_error="/home/sessiondir/LgwNDmTLAx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDm2BKKDm8bFZsm.comment",standard_input="/dev/null",standard_output="/home/sessiondir/LgwNDmTLAx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDm2BKKDm8bFZsm.comment",start_time=1723376763i,state="RUNNING",state_reason="None",submit_time=1723376761i,tasks=1i,time_limit=3600i,tres_billing=1,tres_cpu=1,tres_mem=1000,tres_node=1 1723466497000000000
slurm_jobs,host=hoth,job_id=23366,name=gridjob,source=slurm_primary.example.net command="/tmp/SLURM_job_script.5Y9Ngb",cpus=2i,current_working_directory="/home/sessiondir/HFYKDmULAx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDm3BKKDmiyK3em",group_id=2005i,nice=50i,node_count=1i,nodes="naboo225",partition="atlas",priority=4294878363i,standard_error="/home/sessiondir/HFYKDmULAx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDm3BKKDmiyK3em.comment",standard_input="/dev/null",standard_output="/home/sessiondir/HFYKDmULAx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDm3BKKDmiyK3em.comment",start_time=1723376883i,state="RUNNING",state_reason="None",submit_time=1723376882i,tasks=1i,time_limit=3600i,tres_billing=1,tres_cpu=1,tres_mem=1000,tres_node=1 1723466497000000000
slurm_jobs,host=hoth,job_id=23367,name=gridjob,source=slurm_primary.example.net command="/tmp/SLURM_job_script.NmOqMU",cpus=2i,current_working_directory="/home/sessiondir/nnLLDmULAx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDm4BKKDmfhjFPn",group_id=2005i,nice=50i,node_count=1i,nodes="naboo225",partition="atlas",priority=4294878362i,standard_error="/home/sessiondir/nnLLDmULAx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDm4BKKDmfhjFPn.comment",standard_input="/dev/null",standard_output="/home/sessiondir/nnLLDmULAx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDm4BKKDmfhjFPn.comment",start_time=1723376883i,state="RUNNING",state_reason="None",submit_time=1723376882i,tasks=1i,time_limit=3600i,tres_billing=1,tres_cpu=1,tres_mem=1000,tres_node=1 1723466497000000000
slurm_jobs,host=hoth,job_id=23385,name=gridjob,source=slurm_primary.example.net command="/tmp/SLURM_job_script.NNsI08",cpus=2i,current_working_directory="/home/sessiondir/PWvNDmH7tw5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmz7JKDmqgKyRo",group_id=2005i,nice=50i,node_count=1i,nodes="naboo225",partition="atlas",priority=4294878344i,standard_error="/home/sessiondir/PWvNDmH7tw5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmz7JKDmqgKyRo.comment",standard_input="/dev/null",standard_output="/home/sessiondir/PWvNDmH7tw5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmz7JKDmqgKyRo.comment",start_time=1723378725i,state="RUNNING",state_reason="None",submit_time=1723378725i,tasks=1i,time_limit=3600i,tres_billing=1,tres_cpu=1,tres_mem=1000,tres_node=1 1723466497000000000
slurm_jobs,host=hoth,job_id=23386,name=gridjob,source=slurm_primary.example.net command="/tmp/SLURM_job_script.bcmS4h",cpus=2i,current_working_directory="/home/sessiondir/ZNHMDmI7tw5nKG01gq4B3BRpm7wtQmABFKDmbnHPDm27JKDm3Ve66n",group_id=2005i,nice=50i,node_count=1i,nodes="naboo224",partition="atlas",priority=4294878343i,standard_error="/home/sessiondir/ZNHMDmI7tw5nKG01gq4B3BRpm7wtQmABFKDmbnHPDm27JKDm3Ve66n.comment",standard_input="/dev/null",standard_output="/home/sessiondir/ZNHMDmI7tw5nKG01gq4B3BRpm7wtQmABFKDmbnHPDm27JKDm3Ve66n.comment",start_time=1723379206i,state="RUNNING",state_reason="None",submit_time=1723379205i,tasks=1i,time_limit=3600i,tres_billing=1,tres_cpu=1,tres_mem=1000,tres_node=1 1723466497000000000
slurm_jobs,host=hoth,job_id=23387,name=gridjob,source=slurm_primary.example.net command="/tmp/SLURM_job_script.OgpoQZ",cpus=2i,current_working_directory="/home/sessiondir/qohNDmUqBx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmMCKKDmzM4Yhn",group_id=2005i,nice=50i,node_count=1i,nodes="naboo222",partition="atlas",priority=4294878342i,standard_error="/home/sessiondir/qohNDmUqBx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmMCKKDmzM4Yhn.comment",standard_input="/dev/null",standard_output="/home/sessiondir/qohNDmUqBx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmMCKKDmzM4Yhn.comment",start_time=1723379246i,state="RUNNING",state_reason="None",submit_time=1723379245i,tasks=1i,time_limit=3600i,tres_billing=1,tres_cpu=1,tres_mem=1000,tres_node=1 1723466497000000000
slurm_jobs,host=hoth,job_id=23388,name=gridjob,source=slurm_primary.example.net command="/tmp/SLURM_job_script.xYbxSe",cpus=2i,current_working_directory="/home/sessiondir/u9HODmXqBx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmWCKKDmRlccYn",group_id=2005i,nice=50i,node_count=1i,nodes="naboo224",partition="atlas",priority=4294878341i,standard_error="/home/sessiondir/u9HODmXqBx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmWCKKDmRlccYn.comment",standard_input="/dev/null",standard_output="/home/sessiondir/u9HODmXqBx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmWCKKDmRlccYn.comment",start_time=1723379326i,state="RUNNING",state_reason="None",submit_time=1723379326i,tasks=1i,time_limit=3600i,tres_billing=1,tres_cpu=1,tres_mem=1000,tres_node=1 1723466497000000000
slurm_jobs,host=hoth,job_id=23389,name=gridjob,source=slurm_primary.example.net command="/tmp/SLURM_job_script.QHtIIm",cpus=2i,current_working_directory="/home/sessiondir/ZLvKDmYqBx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmXCKKDmjp19km",group_id=2005i,nice=50i,node_count=1i,nodes="naboo227",partition="atlas",priority=4294878340i,standard_error="/home/sessiondir/ZLvKDmYqBx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmXCKKDmjp19km.comment",standard_input="/dev/null",standard_output="/home/sessiondir/ZLvKDmYqBx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmXCKKDmjp19km.comment",start_time=1723379326i,state="RUNNING",state_reason="None",submit_time=1723379326i,tasks=1i,time_limit=3600i,tres_billing=1,tres_cpu=1,tres_mem=1000,tres_node=1 1723466497000000000
slurm_jobs,host=hoth,job_id=23393,name=gridjob,source=slurm_primary.example.net command="/tmp/SLURM_job_script.IH19bN",cpus=2i,current_working_directory="/home/sessiondir/YdPODmVqBx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmSCKKDmrYDOwm",group_id=2005i,nice=50i,node_count=1i,nodes="naboo224",partition="atlas",priority=4294878336i,standard_error="/home/sessiondir/YdPODmVqBx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmSCKKDmrYDOwm.comment",standard_input="/dev/null",standard_output="/home/sessiondir/YdPODmVqBx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmSCKKDmrYDOwm.comment",start_time=1723379767i,state="RUNNING",state_reason="None",submit_time=1723379766i,tasks=1i,time_limit=3600i,tres_billing=1,tres_cpu=1,tres_mem=1000,tres_node=1 1723466497000000000
slurm_nodes,host=hoth,name=naboo145,source=slurm_primary.example.net alloc_cpu=0i,alloc_memory=0i,architecture="x86_64",cores=18i,cpu_load=0i,cpus=36i,free_memory=86450i,real_memory=94791i,slurmd_version="22.05.9",state="idle",tres_billing=36,tres_cpu=36,tres_mem=94791,weight=1i 1723466497000000000
slurm_nodes,host=hoth,name=naboo146,source=slurm_primary.example.net alloc_cpu=0i,alloc_memory=0i,architecture="x86_64",cores=18i,cpu_load=0i,cpus=36i,free_memory=92148i,real_memory=94791i,slurmd_version="22.05.9",state="idle",tres_billing=36,tres_cpu=36,tres_mem=94791,weight=1i 1723466497000000000
slurm_nodes,host=hoth,name=naboo147,source=slurm_primary.example.net alloc_cpu=36i,alloc_memory=45000i,architecture="x86_64",cores=18i,cpu_load=3826i,cpus=36i,free_memory=1607i,real_memory=94793i,slurmd_version="22.05.9",state="allocated",tres_billing=36,tres_cpu=36,tres_mem=94793,tres_used_cpu=36,tres_used_mem=45000,weight=1i 1723466497000000000
slurm_nodes,host=hoth,name=naboo216,source=slurm_primary.example.net alloc_cpu=8i,alloc_memory=8000i,architecture="x86_64",cores=4i,cpu_load=891i,cpus=8i,free_memory=17972i,real_memory=31877i,slurmd_version="22.05.9",state="allocated",tres_billing=8,tres_cpu=8,tres_mem=31877,tres_used_cpu=8,tres_used_mem=8000,weight=1i 1723466497000000000
slurm_nodes,host=hoth,name=naboo219,source=slurm_primary.example.net alloc_cpu=16i,alloc_memory=16000i,architecture="x86_64",cores=4i,cpu_load=1382i,cpus=16i,free_memory=15645i,real_memory=31875i,slurmd_version="22.05.9",state="allocated",tres_billing=16,tres_cpu=16,tres_mem=31875,tres_used_cpu=16,tres_used_mem=16000,weight=1i 1723466497000000000
slurm_partitions,host=hoth,name=atlas,source=slurm_primary.example.net nodes="naboo145,naboo146,naboo147,naboo216,naboo219,naboo222,naboo224,naboo225,naboo227,naboo228,naboo229,naboo234,naboo235,naboo236,naboo237,naboo238,naboo239,naboo240,naboo241,naboo242,naboo243",state="UP",total_cpu=632i,total_nodes=21i,tres_billing=632,tres_cpu=632,tres_mem=1415207,tres_node=21 1723466497000000000
```


@ -0,0 +1,46 @@
# Gather SLURM metrics
[[inputs.slurm]]
  ## Slurmrestd URL. Both http and https can be used as schemes.
url = "http://127.0.0.1:6820"
## Credentials for JWT-based authentication.
# username = "foo"
# token = "topSecret"
## Enabled endpoints
## List of endpoints a user can acquire data from.
## Available values are: diag, jobs, nodes, partitions, reservations.
# enabled_endpoints = ["diag", "jobs", "nodes", "partitions", "reservations"]
## Maximum time to receive a response. If set to 0s, the
## request will not time out.
# response_timeout = "5s"
## Optional TLS Config. Note these options will only
  ## be taken into account when the scheme specified on
## the URL parameter is https. They will be silently
## ignored otherwise.
## Set to true/false to enforce TLS being enabled/disabled. If not set,
## enable TLS only if any of the other options are specified.
# tls_enable =
## Trusted root certificates for server
# tls_ca = "/path/to/cafile"
## Used for TLS client certificate authentication
# tls_cert = "/path/to/certfile"
## Used for TLS client certificate authentication
# tls_key = "/path/to/keyfile"
## Password for the key file if it is encrypted
# tls_key_pwd = ""
## Send the specified TLS server name via SNI
# tls_server_name = "kubernetes.example.com"
## Minimal TLS version to accept by the client
# tls_min_version = "TLS12"
## List of ciphers to accept, by default all secure ciphers will be accepted
## See https://pkg.go.dev/crypto/tls#pkg-constants for supported values.
## Use "all", "secure" and "insecure" to add all support ciphers, secure
## suites or insecure suites respectively.
# tls_cipher_suites = ["secure"]
## Renegotiation method, "never", "once" or "freely"
# tls_renegotiation_method = "never"
## Use TLS but skip chain & host verification
# insecure_skip_verify = false


@ -0,0 +1,23 @@
# Gather SLURM metrics
[[inputs.slurm]]
  ## Slurmrestd URL. Both http and https can be used as schemes.
url = "http://127.0.0.1:6820"
## Credentials for JWT-based authentication.
# username = "foo"
# token = "topSecret"
## Enabled endpoints
## List of endpoints a user can acquire data from.
## Available values are: diag, jobs, nodes, partitions, reservations.
# enabled_endpoints = ["diag", "jobs", "nodes", "partitions", "reservations"]
## Maximum time to receive a response. If set to 0s, the
## request will not time out.
# response_timeout = "5s"
## Optional TLS Config. Note these options will only
  ## be taken into account when the scheme specified on
## the URL parameter is https. They will be silently
## ignored otherwise.
{{template "/plugins/common/tls/client.conf"}}


@ -0,0 +1,476 @@
//go:generate ../../../tools/config_includer/generator
//go:generate ../../../tools/readme_config_includer/generator
package slurm
import (
"context"
_ "embed"
"errors"
"fmt"
"net/http"
"net/url"
"strconv"
"strings"
"time"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/config"
"github.com/influxdata/telegraf/internal"
"github.com/influxdata/telegraf/plugins/common/tls"
"github.com/influxdata/telegraf/plugins/inputs"
goslurm "github.com/pcolladosoto/goslurm/v0038"
)
//go:embed sample.conf
var sampleConfig string
type Slurm struct {
URL string `toml:"url"`
Username string `toml:"username"`
Token string `toml:"token"`
EnabledEndpoints []string `toml:"enabled_endpoints"`
ResponseTimeout config.Duration `toml:"response_timeout"`
Log telegraf.Logger `toml:"-"`
tls.ClientConfig
client *goslurm.APIClient
baseURL *url.URL
endpointMap map[string]bool
}
func (*Slurm) SampleConfig() string {
return sampleConfig
}
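// Init validates the list of enabled endpoints and the configured URL,
// prepares the TLS configuration (honoured only for https URLs) and builds
// the goslurm API client used on every gather cycle.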
func (s *Slurm) Init() error {
if len(s.EnabledEndpoints) == 0 {
s.EnabledEndpoints = []string{"diag", "jobs", "nodes", "partitions", "reservations"}
}
s.endpointMap = make(map[string]bool, len(s.EnabledEndpoints))
for _, endpoint := range s.EnabledEndpoints {
switch e := strings.ToLower(endpoint); e {
case "diag", "jobs", "nodes", "partitions", "reservations":
s.endpointMap[e] = true
default:
return fmt.Errorf("unknown endpoint %q", endpoint)
}
}
if s.URL == "" {
return errors.New("empty URL provided")
}
u, err := url.Parse(s.URL)
if err != nil {
return err
}
if u.Hostname() == "" {
return fmt.Errorf("empty hostname for url %q", s.URL)
}
s.baseURL = u
if u.Scheme != "http" && u.Scheme != "https" {
return fmt.Errorf("invalid scheme %q", u.Scheme)
}
tlsCfg, err := s.ClientConfig.TLSConfig()
if err != nil {
return err
}
if u.Scheme == "http" && tlsCfg != nil {
s.Log.Warn("non-empty TLS configuration for a URL with an http scheme. Ignoring it...")
tlsCfg = nil
}
configuration := goslurm.NewConfiguration()
configuration.Host = u.Host
configuration.Scheme = u.Scheme
configuration.UserAgent = internal.ProductToken()
configuration.HTTPClient = &http.Client{
Transport: &http.Transport{
TLSClientConfig: tlsCfg,
},
Timeout: time.Duration(s.ResponseTimeout),
}
s.client = goslurm.NewAPIClient(configuration)
return nil
}
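// parseTres splits a SLURM TRES string such as
// "cpu=1,mem=2000M,node=1,billing=1" into a map of per-resource values.
// Memory values are normalized to the M suffix using 1024-based factors;
// values that parse as numbers are stored as floats, anything else is kept
// as a string.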
func (s *Slurm) parseTres(tres string) map[string]interface{} {
tresKVs := strings.Split(tres, ",")
parsedValues := make(map[string]interface{}, len(tresKVs))
for _, tresVal := range tresKVs {
parsedTresVal := strings.Split(tresVal, "=")
if len(parsedTresVal) != 2 {
continue
}
tag := parsedTresVal[0]
val := parsedTresVal[1]
var factor float64 = 1
if tag == "mem" {
var ok bool
factor, ok = map[string]float64{
"K": 1.0 / 1024.0,
"M": 1,
"G": 1024,
"T": 1024 * 1024,
"P": 1024 * 1024 * 1024,
}[strings.ToUpper(val[len(val)-1:])]
if !ok {
continue
}
val = val[:len(val)-1]
}
parsedFloat, err := strconv.ParseFloat(val, 64)
if err == nil {
parsedValues[tag] = parsedFloat * factor
continue
}
parsedValues[tag] = val
}
return parsedValues
}
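// gatherDiagMetrics flattens the scheduler statistics returned by the diag
// endpoint into a single slurm_diag metric tagged with the slurmrestd host.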
func (s *Slurm) gatherDiagMetrics(acc telegraf.Accumulator, diag *goslurm.V0038DiagStatistics) {
records := make(map[string]interface{}, 13)
tags := map[string]string{"source": s.baseURL.Hostname()}
if int32Ptr, ok := diag.GetServerThreadCountOk(); ok {
records["server_thread_count"] = *int32Ptr
}
if int32Ptr, ok := diag.GetJobsCanceledOk(); ok {
records["jobs_canceled"] = *int32Ptr
}
if int32Ptr, ok := diag.GetJobsSubmittedOk(); ok {
records["jobs_submitted"] = *int32Ptr
}
if int32Ptr, ok := diag.GetJobsStartedOk(); ok {
records["jobs_started"] = *int32Ptr
}
if int32Ptr, ok := diag.GetJobsCompletedOk(); ok {
records["jobs_completed"] = *int32Ptr
}
if int32Ptr, ok := diag.GetJobsFailedOk(); ok {
records["jobs_failed"] = *int32Ptr
}
if int32Ptr, ok := diag.GetJobsPendingOk(); ok {
records["jobs_pending"] = *int32Ptr
}
if int32Ptr, ok := diag.GetJobsRunningOk(); ok {
records["jobs_running"] = *int32Ptr
}
if int32Ptr, ok := diag.GetScheduleCycleLastOk(); ok {
records["schedule_cycle_last"] = *int32Ptr
}
if int32Ptr, ok := diag.GetScheduleCycleMeanOk(); ok {
records["schedule_cycle_mean"] = *int32Ptr
}
if int32Ptr, ok := diag.GetBfQueueLenOk(); ok {
records["bf_queue_len"] = *int32Ptr
}
if int32Ptr, ok := diag.GetBfQueueLenMeanOk(); ok {
records["bf_queue_len_mean"] = *int32Ptr
}
if boolPtr, ok := diag.GetBfActiveOk(); ok {
records["bf_active"] = *boolPtr
}
acc.AddFields("slurm_diag", records, tags)
}
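// gatherJobsMetrics emits one slurm_jobs metric per job, tagged with the job
// name and ID. Backslashes are stripped from path-like fields and the
// requested TRES string is expanded into tres_* fields.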
func (s *Slurm) gatherJobsMetrics(acc telegraf.Accumulator, jobs []goslurm.V0038JobResponseProperties) {
for i := range jobs {
records := make(map[string]interface{}, 19)
tags := make(map[string]string, 3)
tags["source"] = s.baseURL.Hostname()
if strPtr, ok := jobs[i].GetNameOk(); ok {
tags["name"] = *strPtr
}
if int32Ptr, ok := jobs[i].GetJobIdOk(); ok {
tags["job_id"] = strconv.Itoa(int(*int32Ptr))
}
if strPtr, ok := jobs[i].GetJobStateOk(); ok {
records["state"] = *strPtr
}
if strPtr, ok := jobs[i].GetStateReasonOk(); ok {
records["state_reason"] = *strPtr
}
if strPtr, ok := jobs[i].GetPartitionOk(); ok {
records["partition"] = *strPtr
}
if strPtr, ok := jobs[i].GetNodesOk(); ok {
records["nodes"] = *strPtr
}
if int32Ptr, ok := jobs[i].GetNodeCountOk(); ok {
records["node_count"] = *int32Ptr
}
if int64Ptr, ok := jobs[i].GetPriorityOk(); ok {
records["priority"] = *int64Ptr
}
if int32Ptr, ok := jobs[i].GetNiceOk(); ok {
records["nice"] = *int32Ptr
}
if int32Ptr, ok := jobs[i].GetGroupIdOk(); ok {
records["group_id"] = *int32Ptr
}
if strPtr, ok := jobs[i].GetCommandOk(); ok {
records["command"] = *strPtr
}
if strPtr, ok := jobs[i].GetStandardOutputOk(); ok {
records["standard_output"] = strings.ReplaceAll(*strPtr, "\\", "")
}
if strPtr, ok := jobs[i].GetStandardErrorOk(); ok {
records["standard_error"] = strings.ReplaceAll(*strPtr, "\\", "")
}
if strPtr, ok := jobs[i].GetStandardInputOk(); ok {
records["standard_input"] = strings.ReplaceAll(*strPtr, "\\", "")
}
if strPtr, ok := jobs[i].GetCurrentWorkingDirectoryOk(); ok {
records["current_working_directory"] = strings.ReplaceAll(*strPtr, "\\", "")
}
if int64Ptr, ok := jobs[i].GetSubmitTimeOk(); ok {
records["submit_time"] = *int64Ptr
}
if int64Ptr, ok := jobs[i].GetStartTimeOk(); ok {
records["start_time"] = *int64Ptr
}
if int32Ptr, ok := jobs[i].GetCpusOk(); ok {
records["cpus"] = *int32Ptr
}
if int32Ptr, ok := jobs[i].GetTasksOk(); ok {
records["tasks"] = *int32Ptr
}
if int64Ptr, ok := jobs[i].GetTimeLimitOk(); ok {
records["time_limit"] = *int64Ptr
}
if strPtr, ok := jobs[i].GetTresReqStrOk(); ok {
for k, v := range s.parseTres(*strPtr) {
records["tres_"+k] = v
}
}
acc.AddFields("slurm_jobs", records, tags)
}
}
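// gatherNodesMetrics emits one slurm_nodes metric per node, expanding both
// the configured (tres_*) and currently used (tres_used_*) TRES strings.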
func (s *Slurm) gatherNodesMetrics(acc telegraf.Accumulator, nodes []goslurm.V0038Node) {
for _, node := range nodes {
records := make(map[string]interface{}, 13)
tags := make(map[string]string, 2)
tags["source"] = s.baseURL.Hostname()
if strPtr, ok := node.GetNameOk(); ok {
tags["name"] = *strPtr
}
if strPtr, ok := node.GetStateOk(); ok {
records["state"] = *strPtr
}
if int32Ptr, ok := node.GetCoresOk(); ok {
records["cores"] = *int32Ptr
}
if int32Ptr, ok := node.GetCpusOk(); ok {
records["cpus"] = *int32Ptr
}
if int64Ptr, ok := node.GetCpuLoadOk(); ok {
records["cpu_load"] = *int64Ptr
}
if int64Ptr, ok := node.GetAllocCpusOk(); ok {
records["alloc_cpu"] = *int64Ptr
}
if int32Ptr, ok := node.GetRealMemoryOk(); ok {
records["real_memory"] = *int32Ptr
}
if int32Ptr, ok := node.GetFreeMemoryOk(); ok {
records["free_memory"] = *int32Ptr
}
if int64Ptr, ok := node.GetAllocMemoryOk(); ok {
records["alloc_memory"] = *int64Ptr
}
if strPtr, ok := node.GetTresOk(); ok {
for k, v := range s.parseTres(*strPtr) {
records["tres_"+k] = v
}
}
if strPtr, ok := node.GetTresUsedOk(); ok {
for k, v := range s.parseTres(*strPtr) {
records["tres_used_"+k] = v
}
}
if int32Ptr, ok := node.GetWeightOk(); ok {
records["weight"] = *int32Ptr
}
if strPtr, ok := node.GetSlurmdVersionOk(); ok {
records["slurmd_version"] = *strPtr
}
if strPtr, ok := node.GetArchitectureOk(); ok {
records["architecture"] = *strPtr
}
acc.AddFields("slurm_nodes", records, tags)
}
}
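// gatherPartitionsMetrics emits one slurm_partitions metric per partition,
// including its TRES configuration as tres_* fields.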
func (s *Slurm) gatherPartitionsMetrics(acc telegraf.Accumulator, partitions []goslurm.V0038Partition) {
for _, partition := range partitions {
records := make(map[string]interface{}, 5)
tags := make(map[string]string, 2)
tags["source"] = s.baseURL.Hostname()
if strPtr, ok := partition.GetNameOk(); ok {
tags["name"] = *strPtr
}
if strPtr, ok := partition.GetStateOk(); ok {
records["state"] = *strPtr
}
if int32Ptr, ok := partition.GetTotalCpusOk(); ok {
records["total_cpu"] = *int32Ptr
}
if int32Ptr, ok := partition.GetTotalNodesOk(); ok {
records["total_nodes"] = *int32Ptr
}
if strPtr, ok := partition.GetNodesOk(); ok {
records["nodes"] = *strPtr
}
if strPtr, ok := partition.GetTresOk(); ok {
for k, v := range s.parseTres(*strPtr) {
records["tres_"+k] = v
}
}
acc.AddFields("slurm_partitions", records, tags)
}
}
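// gatherReservationsMetrics emits one slurm_reservations metric per
// reservation.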
func (s *Slurm) gatherReservationsMetrics(acc telegraf.Accumulator, reservations []goslurm.V0038Reservation) {
for _, reservation := range reservations {
records := make(map[string]interface{}, 9)
tags := make(map[string]string, 2)
tags["source"] = s.baseURL.Hostname()
if strPtr, ok := reservation.GetNameOk(); ok {
tags["name"] = *strPtr
}
if int32Ptr, ok := reservation.GetCoreCountOk(); ok {
records["core_count"] = *int32Ptr
}
if int32Ptr, ok := reservation.GetCoreSpecCntOk(); ok {
records["core_spec_count"] = *int32Ptr
}
if strPtr, ok := reservation.GetGroupsOk(); ok {
records["groups"] = *strPtr
}
if strPtr, ok := reservation.GetUsersOk(); ok {
records["users"] = *strPtr
}
if int32Ptr, ok := reservation.GetStartTimeOk(); ok {
records["start_time"] = *int32Ptr
}
if strPtr, ok := reservation.GetPartitionOk(); ok {
records["partition"] = *strPtr
}
if strPtr, ok := reservation.GetAccountsOk(); ok {
records["accounts"] = *strPtr
}
if int32Ptr, ok := reservation.GetNodeCountOk(); ok {
records["node_count"] = *int32Ptr
}
if strPtr, ok := reservation.GetNodeListOk(); ok {
records["node_list"] = *strPtr
}
acc.AddFields("slurm_reservations", records, tags)
}
}
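// Gather queries every enabled slurmrestd endpoint with the configured JWT
// credentials and hands the decoded responses to the per-endpoint helpers.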
func (s *Slurm) Gather(acc telegraf.Accumulator) (err error) {
auth := context.WithValue(
context.Background(),
goslurm.ContextAPIKeys,
map[string]goslurm.APIKey{
"user": {Key: s.Username},
"token": {Key: s.Token},
},
)
if s.endpointMap["diag"] {
diagResp, respRaw, err := s.client.SlurmAPI.SlurmV0038Diag(auth).Execute()
if err != nil {
return fmt.Errorf("error getting diag: %w", err)
}
if diag, ok := diagResp.GetStatisticsOk(); ok {
s.gatherDiagMetrics(acc, diag)
}
respRaw.Body.Close()
}
if s.endpointMap["jobs"] {
jobsResp, respRaw, err := s.client.SlurmAPI.SlurmV0038GetJobs(auth).Execute()
if err != nil {
return fmt.Errorf("error getting jobs: %w", err)
}
if jobs, ok := jobsResp.GetJobsOk(); ok {
s.gatherJobsMetrics(acc, jobs)
}
respRaw.Body.Close()
}
if s.endpointMap["nodes"] {
nodesResp, respRaw, err := s.client.SlurmAPI.SlurmV0038GetNodes(auth).Execute()
if err != nil {
return fmt.Errorf("error getting nodes: %w", err)
}
if nodes, ok := nodesResp.GetNodesOk(); ok {
s.gatherNodesMetrics(acc, nodes)
}
respRaw.Body.Close()
}
if s.endpointMap["partitions"] {
partitionsResp, respRaw, err := s.client.SlurmAPI.SlurmV0038GetPartitions(auth).Execute()
if err != nil {
return fmt.Errorf("error getting partitions: %w", err)
}
if partitions, ok := partitionsResp.GetPartitionsOk(); ok {
s.gatherPartitionsMetrics(acc, partitions)
}
respRaw.Body.Close()
}
if s.endpointMap["reservations"] {
reservationsResp, respRaw, err := s.client.SlurmAPI.SlurmV0038GetReservations(auth).Execute()
if err != nil {
return fmt.Errorf("error getting reservations: %w", err)
}
if reservations, ok := reservationsResp.GetReservationsOk(); ok {
s.gatherReservationsMetrics(acc, reservations)
}
respRaw.Body.Close()
}
return nil
}
func init() {
inputs.Add("slurm", func() telegraf.Input {
return &Slurm{
ResponseTimeout: config.Duration(5 * time.Second),
}
})
}


@ -0,0 +1,152 @@
package slurm
import (
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"strings"
"testing"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/config"
"github.com/influxdata/telegraf/plugins/parsers/influx"
"github.com/influxdata/telegraf/testutil"
"github.com/stretchr/testify/require"
)
func TestGoodURLs(t *testing.T) {
tests := []struct {
name string
url string
}{
{"http", "http://example.com:6820"},
{"https", "https://example.com:6820"},
{"http no port", "http://example.com"},
{"https no port", "https://example.com"},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
plugin := Slurm{
URL: tt.url,
}
require.NoError(t, plugin.Init())
})
}
}
func TestWrongURLs(t *testing.T) {
tests := []struct {
name string
url string
}{
{"wrong http scheme", "httpp://example.com:6820"},
{"wrong https scheme", "httpss://example.com:6820"},
{"empty url", ""},
{"empty hostname", "http://:6820"},
{"only scheme", "http://"},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
plugin := Slurm{
URL: tt.url,
}
require.Error(t, plugin.Init())
})
}
}
func TestWrongEndpoints(t *testing.T) {
tests := []struct {
name string
enabledEndpoints []string
}{
{"empty endpoint", []string{"diag", "", "jobs"}},
{"mistyped endpoint", []string{"diagg", "jobs", "partitions"}},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
plugin := Slurm{
URL: "http://example.net",
EnabledEndpoints: tt.enabledEndpoints,
}
require.Error(t, plugin.Init())
})
}
}
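// TestCases replays the canned slurmrestd JSON responses found under
// testcases/*/responses through a local HTTP test server and compares the
// gathered metrics against each testcase's expected.out.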
func TestCases(t *testing.T) {
entries, err := os.ReadDir("testcases")
require.NoError(t, err)
for _, entry := range entries {
if !entry.IsDir() {
continue
}
t.Run(entry.Name(), func(t *testing.T) {
testcasePath := filepath.Join("testcases", entry.Name())
responsesPath := filepath.Join(testcasePath, "responses")
expectedFilename := filepath.Join(testcasePath, "expected.out")
configFilename := filepath.Join(testcasePath, "telegraf.conf")
responses, err := os.ReadDir(responsesPath)
require.NoError(t, err)
pathToResponse := map[string][]byte{}
for _, response := range responses {
if response.IsDir() {
continue
}
fName := response.Name()
buf, err := os.ReadFile(filepath.Join(responsesPath, fName))
require.NoError(t, err)
pathToResponse[strings.TrimSuffix(fName, filepath.Ext(fName))] = buf
}
// Prepare the influx parser for expectations
parser := &influx.Parser{}
require.NoError(t, parser.Init())
// Read expected values, if any
var expected []telegraf.Metric
if _, err := os.Stat(expectedFilename); err == nil {
var err error
expected, err = testutil.ParseMetricsFromFile(expectedFilename, parser)
require.NoError(t, err)
}
ts := httptest.NewServer(http.NotFoundHandler())
defer ts.Close()
ts.Config.Handler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
resp, ok := pathToResponse[strings.TrimPrefix(r.URL.Path, "/slurm/v0.0.38/")]
require.True(t, ok)
w.Header().Add("Content-Type", "application/json")
w.WriteHeader(http.StatusOK)
_, err := w.Write(resp)
require.NoError(t, err)
})
// Load the test-specific configuration
cfg := config.NewConfig()
cfg.Agent.Quiet = true
require.NoError(t, cfg.LoadConfig(configFilename))
require.Len(t, cfg.Inputs, 1)
// Instantiate the plugin. As described in NewConfig's documentation,
// parsing the configuration already instantiates the plugins, so we
// only need to assert the plugin's type.
plugin := cfg.Inputs[0].Input.(*Slurm)
plugin.URL = "http://" + ts.Listener.Addr().String()
plugin.Log = testutil.Logger{}
require.NoError(t, plugin.Init())
var acc testutil.Accumulator
require.NoError(t, plugin.Gather(&acc))
actual := acc.GetTelegrafMetrics()
testutil.RequireMetricsEqual(t, expected, actual, testutil.SortMetrics(), testutil.IgnoreTime())
})
}
}


@ -0,0 +1,11 @@
slurm_diag,source=127.0.0.1 bf_active=false,bf_queue_len=1i,bf_queue_len_mean=1i,jobs_canceled=0i,jobs_completed=287i,jobs_failed=1i,jobs_pending=0i,jobs_running=100i,jobs_started=287i,jobs_submitted=287i,schedule_cycle_last=298i,schedule_cycle_mean=137i,server_thread_count=3i 1723464650000000000
slurm_jobs,job_id=20464,name=gridjob,source=127.0.0.1 command="/tmp/SLURM_job_script.OjQEIH",cpus=2i,current_working_directory="/home/sessiondir/zv6NDmqNcv5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmXSJKDmFRYcQm",group_id=2005i,nice=50i,node_count=1i,nodes="naboo222",partition="atlas",priority=4294881265i,standard_error="/home/sessiondir/zv6NDmqNcv5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmXSJKDmFRYcQm.comment",standard_input="/dev/null",standard_output="/home/sessiondir/zv6NDmqNcv5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmXSJKDmFRYcQm.comment",start_time=1722989851i,state="RUNNING",state_reason="None",submit_time=1722989851i,tasks=1i,time_limit=3600i,tres_billing=1,tres_cpu=1,tres_mem=2000,tres_node=1 1723464650000000000
slurm_jobs,job_id=20468,name=gridjob,source=127.0.0.1 command="/tmp/SLURM_job_script.XTwtdj",cpus=2i,current_working_directory="/home/sessiondir/ljvLDmQccv5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmcSJKDmor4c2n",group_id=2005i,nice=50i,node_count=1i,nodes="naboo222",partition="atlas",priority=4294881261i,standard_error="/home/sessiondir/ljvLDmQccv5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmcSJKDmor4c2n.comment",standard_input="/dev/null",standard_output="/home/sessiondir/ljvLDmQccv5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmcSJKDmor4c2n.comment",start_time=1722990772i,state="RUNNING",state_reason="None",submit_time=1722990772i,tasks=1i,time_limit=3600i,tres_billing=1,tres_cpu=1,tres_mem=2000,tres_node=1 1723464650000000000
slurm_jobs,job_id=23772,name=gridjob,source=127.0.0.1 command="/tmp/SLURM_job_script.8PMmVe",cpus=8i,current_working_directory="/home/sessiondir/nN8KDmNMPx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmeIKKDml0xJjm",group_id=2005i,nice=50i,node_count=1i,nodes="naboo147",partition="atlas",priority=4294877957i,standard_error="/home/sessiondir/nN8KDmNMPx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmeIKKDml0xJjm.comment",standard_input="/dev/null",standard_output="/home/sessiondir/nN8KDmNMPx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmeIKKDml0xJjm.comment",start_time=1723457333i,state="COMPLETED",state_reason="None",submit_time=1723457333i,tasks=8i,time_limit=3600i,tres_billing=8,tres_cpu=8,tres_mem=16000,tres_node=1 1723464650000000000
slurm_nodes,name=naboo145,source=127.0.0.1 alloc_cpu=0i,alloc_memory=0i,architecture="x86_64",cores=18i,cpu_load=27i,cpus=36i,free_memory=86423i,real_memory=94791i,slurmd_version="22.05.9",state="idle",tres_billing=36,tres_cpu=36,tres_mem=94791,weight=1i 1723464650000000000
slurm_nodes,name=naboo146,source=127.0.0.1 alloc_cpu=0i,alloc_memory=0i,architecture="x86_64",cores=18i,cpu_load=0i,cpus=36i,free_memory=92151i,real_memory=94791i,slurmd_version="22.05.9",state="idle",tres_billing=36,tres_cpu=36,tres_mem=94791,weight=1i 1723464650000000000
slurm_nodes,name=naboo147,source=127.0.0.1 alloc_cpu=36i,alloc_memory=56000i,architecture="x86_64",cores=18i,cpu_load=2969i,cpus=36i,free_memory=10908i,real_memory=94793i,slurmd_version="22.05.9",state="allocated",tres_billing=36,tres_cpu=36,tres_mem=94793,tres_used_cpu=36,tres_used_mem=56000,weight=1i 1723464650000000000
slurm_partitions,name=atlas,source=127.0.0.1 nodes="naboo145,naboo146,naboo147,naboo216,naboo219,naboo222,naboo224,naboo225,naboo227,naboo228,naboo229,naboo234,naboo235,naboo236,naboo237,naboo238,naboo239,naboo240,naboo241,naboo242,naboo243",state="UP",total_cpu=632i,total_nodes=21i,tres_billing=632,tres_cpu=632,tres_mem=1415207,tres_node=21 1723464650000000000


@ -0,0 +1,224 @@
{
"meta": {
"plugin": {
"type": "openapi\/v0.0.38",
"name": "Slurm OpenAPI v0.0.38"
},
"Slurm": {
"version": {
"major": 22,
"micro": 9,
"minor": 5
},
"release": "22.05.9"
}
},
"errors": [
],
"statistics": {
"rpcs_by_message_type": [
{
"message_type": "REQUEST_JOB_INFO",
"type_id": 2003,
"count": 73587,
"average_time": 658,
"total_time": 48479000
},
{
"message_type": "REQUEST_PARTITION_INFO",
"type_id": 2009,
"count": 158967,
"average_time": 101,
"total_time": 16185440
},
{
"message_type": "MESSAGE_NODE_REGISTRATION_STATUS",
"type_id": 1002,
"count": 18690,
"average_time": 137,
"total_time": 2566758
},
{
"message_type": "REQUEST_COMPLETE_BATCH_SCRIPT",
"type_id": 5018,
"count": 12233,
"average_time": 486,
"total_time": 5946490
},
{
"message_type": "REQUEST_AUTH_TOKEN",
"type_id": 5039,
"count": 36,
"average_time": 291,
"total_time": 10489
},
{
"message_type": "REQUEST_BUILD_INFO",
"type_id": 2001,
"count": 28201,
"average_time": 194,
"total_time": 5486061
},
{
"message_type": "REQUEST_PING",
"type_id": 1008,
"count": 28201,
"average_time": 103,
"total_time": 2925195
},
{
"message_type": "REQUEST_NODE_INFO",
"type_id": 2007,
"count": 85379,
"average_time": 175,
"total_time": 15007960
},
{
"message_type": "REQUEST_FED_INFO",
"type_id": 2049,
"count": 24466,
"average_time": 109,
"total_time": 2681655
},
{
"message_type": "REQUEST_JOB_INFO_SINGLE",
"type_id": 2021,
"count": 24466,
"average_time": 121,
"total_time": 2963320
},
{
"message_type": "REQUEST_SUBMIT_BATCH_JOB",
"type_id": 4003,
"count": 12233,
"average_time": 6504,
"total_time": 79574600
},
{
"message_type": "REQUEST_STATS_INFO",
"type_id": 2035,
"count": 1040,
"average_time": 61,
"total_time": 64431
},
{
"message_type": "MESSAGE_EPILOG_COMPLETE",
"type_id": 6012,
"count": 40,
"average_time": 86,
"total_time": 3455
},
{
"message_type": "REQUEST_RESERVATION_INFO",
"type_id": 2024,
"count": 1017,
"average_time": 47,
"total_time": 48788
},
{
"message_type": "REQUEST_LICENSE_INFO",
"type_id": 1021,
"count": 42,
"average_time": 43,
"total_time": 1823
},
{
"message_type": "REQUEST_UPDATE_NODE",
"type_id": 3002,
"count": 2,
"average_time": 415,
"total_time": 830
}
],
"rpcs_by_user": [
{
"user": "root",
"user_id": 0,
"count": 456365,
"average_time": 224,
"total_time": 102371523
},
{
"user": "atl001",
"user_id": 2006,
"count": 11699,
"average_time": 6611,
"total_time": 77353396
},
{
"user": "atl002",
"user_id": 2007,
"count": 120,
"average_time": 3684,
"total_time": 442106
},
{
"user": "ops001",
"user_id": 18006,
"count": 298,
"average_time": 4447,
"total_time": 1325496
},
{
"user": "ops003",
"user_id": 18008,
"count": 58,
"average_time": 3732,
"total_time": 216488
},
{
"user": "ops002",
"user_id": 18007,
"count": 58,
"average_time": 4088,
"total_time": 237114
},
{
"user": "99",
"user_id": 99,
"count": 2,
"average_time": 86,
"total_time": 172
}
],
"parts_packed": 1,
"req_time": 1723103198,
"req_time_start": 1723075200,
"server_thread_count": 3,
"agent_queue_size": 0,
"agent_count": 0,
"agent_thread_count": 0,
"dbd_agent_queue_size": 0,
"gettimeofday_latency": 21,
"schedule_cycle_max": 1116,
"schedule_cycle_last": 298,
"schedule_cycle_total": 960,
"schedule_cycle_mean": 137,
"schedule_cycle_mean_depth": 0,
"schedule_cycle_per_minute": 2,
"schedule_queue_length": 1,
"jobs_submitted": 287,
"jobs_started": 287,
"jobs_completed": 287,
"jobs_canceled": 0,
"jobs_failed": 1,
"jobs_pending": 0,
"jobs_running": 100,
"job_states_ts": 1723103172,
"bf_backfilled_jobs": 1626,
"bf_last_backfilled_jobs": 14,
"bf_backfilled_het_jobs": 0,
"bf_cycle_counter": 12,
"bf_cycle_mean": 440,
"bf_depth_mean": 1,
"bf_depth_mean_try": 1,
"bf_cycle_last": 387,
"bf_cycle_max": 811,
"bf_queue_len": 1,
"bf_queue_len_mean": 1,
"bf_table_size": 1,
"bf_table_size_mean": 1,
"bf_when_last_cycle": 1723102514,
"bf_active": false
}
}


@ -0,0 +1,448 @@
{
"meta": {
"plugin": {
"type": "openapi\/v0.0.38",
"name": "Slurm OpenAPI v0.0.38"
},
"Slurm": {
"version": {
"major": 22,
"micro": 9,
"minor": 5
},
"release": "22.05.9"
}
},
"errors": [
],
"jobs": [
{
"account": "",
"accrue_time": 1722989851,
"admin_comment": "",
"array_job_id": 0,
"array_task_id": null,
"array_max_tasks": 0,
"array_task_string": "",
"association_id": 0,
"batch_features": "",
"batch_flag": true,
"batch_host": "naboo222",
"flags": [
"JOB_WAS_RUNNING",
"JOB_MEM_SET"
],
"burst_buffer": "",
"burst_buffer_state": "",
"cluster": "local",
"cluster_features": "",
"command": "\/tmp\/SLURM_job_script.OjQEIH",
"comment": "",
"container": "",
"contiguous": false,
"core_spec": null,
"thread_spec": null,
"cores_per_socket": null,
"billable_tres": 2.0,
"cpus_per_task": null,
"cpu_frequency_minimum": null,
"cpu_frequency_maximum": null,
"cpu_frequency_governor": null,
"cpus_per_tres": "",
"deadline": 0,
"delay_boot": 0,
"dependency": "",
"derived_exit_code": 0,
"eligible_time": 1722989851,
"end_time": 1723205851,
"excluded_nodes": "",
"exit_code": 0,
"features": "",
"federation_origin": "",
"federation_siblings_active": "",
"federation_siblings_viable": "",
"gres_detail": [
],
"group_id": 2005,
"group_name": "atlas",
"job_id": 20464,
"job_resources": {
"nodes": "naboo222",
"allocated_hosts": 1,
"allocated_nodes": [
{
"sockets": {
"0": {
"cores": {
"0": "allocated"
}
}
},
"nodename": "naboo222",
"cpus_used": 0,
"memory_used": 0,
"memory_allocated": 4000
}
]
},
"job_state": "RUNNING",
"last_sched_evaluation": 1722989851,
"licenses": "",
"max_cpus": 0,
"max_nodes": 0,
"mcs_label": "",
"memory_per_tres": "",
"name": "gridjob",
"nodes": "naboo222",
"nice": 50,
"tasks_per_core": null,
"tasks_per_node": 0,
"tasks_per_socket": null,
"tasks_per_board": 0,
"cpus": 2,
"node_count": 1,
"tasks": 1,
"het_job_id": 0,
"het_job_id_set": "",
"het_job_offset": 0,
"partition": "atlas",
"prefer": "",
"memory_per_node": null,
"memory_per_cpu": 2000,
"minimum_cpus_per_node": 1,
"minimum_tmp_disk_per_node": 0,
"preempt_time": 0,
"pre_sus_time": 0,
"priority": 4294881265,
"profile": null,
"qos": "",
"reboot": false,
"required_nodes": "",
"requeue": false,
"resize_time": 0,
"restart_cnt": 0,
"resv_name": "",
"shared": null,
"show_flags": [
"SHOW_ALL",
"SHOW_DETAIL",
"SHOW_LOCAL"
],
"sockets_per_board": 0,
"sockets_per_node": null,
"start_time": 1722989851,
"state_description": "",
"state_reason": "None",
"standard_error": "\/home\/sessiondir\/zv6NDmqNcv5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmXSJKDmFRYcQm.comment",
"standard_input": "\/dev\/null",
"standard_output": "\/home\/sessiondir\/zv6NDmqNcv5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmXSJKDmFRYcQm.comment",
"submit_time": 1722989851,
"suspend_time": 0,
"system_comment": "",
"time_limit": 3600,
"time_minimum": 0,
"threads_per_core": null,
"tres_bind": "",
"tres_freq": "",
"tres_per_job": "",
"tres_per_node": "",
"tres_per_socket": "",
"tres_per_task": "",
"tres_req_str": "cpu=1,mem=2000M,node=1,billing=1",
"tres_alloc_str": "cpu=2,mem=4000M,node=1,billing=2",
"user_id": 2006,
"user_name": "atl001",
"wckey": "",
"current_working_directory": "\/home\/sessiondir\/zv6NDmqNcv5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmXSJKDmFRYcQm"
},
{
"account": "",
"accrue_time": 1722990772,
"admin_comment": "",
"array_job_id": 0,
"array_task_id": null,
"array_max_tasks": 0,
"array_task_string": "",
"association_id": 0,
"batch_features": "",
"batch_flag": true,
"batch_host": "naboo222",
"flags": [
"JOB_WAS_RUNNING",
"JOB_MEM_SET"
],
"burst_buffer": "",
"burst_buffer_state": "",
"cluster": "local",
"cluster_features": "",
"command": "\/tmp\/SLURM_job_script.XTwtdj",
"comment": "",
"container": "",
"contiguous": false,
"core_spec": null,
"thread_spec": null,
"cores_per_socket": null,
"billable_tres": 2.0,
"cpus_per_task": null,
"cpu_frequency_minimum": null,
"cpu_frequency_maximum": null,
"cpu_frequency_governor": null,
"cpus_per_tres": "",
"deadline": 0,
"delay_boot": 0,
"dependency": "",
"derived_exit_code": 0,
"eligible_time": 1722990772,
"end_time": 1723206772,
"excluded_nodes": "",
"exit_code": 0,
"features": "",
"federation_origin": "",
"federation_siblings_active": "",
"federation_siblings_viable": "",
"gres_detail": [
],
"group_id": 2005,
"group_name": "atlas",
"job_id": 20468,
"job_resources": {
"nodes": "naboo222",
"allocated_hosts": 1,
"allocated_nodes": [
{
"sockets": {
"1": {
"cores": {
"2": "allocated"
}
}
},
"nodename": "naboo222",
"cpus_used": 0,
"memory_used": 0,
"memory_allocated": 4000
}
]
},
"job_state": "RUNNING",
"last_sched_evaluation": 1722990772,
"licenses": "",
"max_cpus": 0,
"max_nodes": 0,
"mcs_label": "",
"memory_per_tres": "",
"name": "gridjob",
"nodes": "naboo222",
"nice": 50,
"tasks_per_core": null,
"tasks_per_node": 0,
"tasks_per_socket": null,
"tasks_per_board": 0,
"cpus": 2,
"node_count": 1,
"tasks": 1,
"het_job_id": 0,
"het_job_id_set": "",
"het_job_offset": 0,
"partition": "atlas",
"prefer": "",
"memory_per_node": null,
"memory_per_cpu": 2000,
"minimum_cpus_per_node": 1,
"minimum_tmp_disk_per_node": 0,
"preempt_time": 0,
"pre_sus_time": 0,
"priority": 4294881261,
"profile": null,
"qos": "",
"reboot": false,
"required_nodes": "",
"requeue": false,
"resize_time": 0,
"restart_cnt": 0,
"resv_name": "",
"shared": null,
"show_flags": [
"SHOW_ALL",
"SHOW_DETAIL",
"SHOW_LOCAL"
],
"sockets_per_board": 0,
"sockets_per_node": null,
"start_time": 1722990772,
"state_description": "",
"state_reason": "None",
"standard_error": "\/home\/sessiondir\/ljvLDmQccv5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmcSJKDmor4c2n.comment",
"standard_input": "\/dev\/null",
"standard_output": "\/home\/sessiondir\/ljvLDmQccv5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmcSJKDmor4c2n.comment",
"submit_time": 1722990772,
"suspend_time": 0,
"system_comment": "",
"time_limit": 3600,
"time_minimum": 0,
"threads_per_core": null,
"tres_bind": "",
"tres_freq": "",
"tres_per_job": "",
"tres_per_node": "",
"tres_per_socket": "",
"tres_per_task": "",
"tres_req_str": "cpu=1,mem=2000M,node=1,billing=1",
"tres_alloc_str": "cpu=2,mem=4000M,node=1,billing=2",
"user_id": 2006,
"user_name": "atl001",
"wckey": "",
"current_working_directory": "\/home\/sessiondir\/ljvLDmQccv5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmcSJKDmor4c2n"
},
{
"account": "",
"accrue_time": 1723457333,
"admin_comment": "",
"array_job_id": 0,
"array_task_id": null,
"array_max_tasks": 0,
"array_task_string": "",
"association_id": 0,
"batch_features": "",
"batch_flag": true,
"batch_host": "naboo147",
"flags": [
"TRES_STR_CALC",
"JOB_MEM_SET"
],
"burst_buffer": "",
"burst_buffer_state": "",
"cluster": "local",
"cluster_features": "",
"command": "\/tmp\/SLURM_job_script.8PMmVe",
"comment": "",
"container": "",
"contiguous": false,
"core_spec": null,
"thread_spec": null,
"cores_per_socket": null,
"billable_tres": 8.0,
"cpus_per_task": null,
"cpu_frequency_minimum": null,
"cpu_frequency_maximum": null,
"cpu_frequency_governor": null,
"cpus_per_tres": "",
"deadline": 0,
"delay_boot": 0,
"dependency": "",
"derived_exit_code": 0,
"eligible_time": 1723457333,
"end_time": 1723463525,
"excluded_nodes": "",
"exit_code": 0,
"features": "",
"federation_origin": "",
"federation_siblings_active": "",
"federation_siblings_viable": "",
"gres_detail": [
],
"group_id": 2005,
"group_name": "atlas",
"job_id": 23772,
"job_resources": {
"nodes": "naboo147",
"allocated_hosts": 1,
"allocated_nodes": [
{
"sockets": {
"0": {
"cores": {
"3": "allocated",
"10": "allocated",
"12": "allocated",
"13": "allocated"
}
},
"1": {
"cores": {
"8": "allocated",
"11": "allocated",
"12": "allocated",
"13": "allocated"
}
}
},
"nodename": "naboo147",
"cpus_used": 0,
"memory_used": 0,
"memory_allocated": 16000
}
]
},
"job_state": "COMPLETED",
"last_sched_evaluation": 1723457333,
"licenses": "",
"max_cpus": 0,
"max_nodes": 0,
"mcs_label": "",
"memory_per_tres": "",
"name": "gridjob",
"nodes": "naboo147",
"nice": 50,
"tasks_per_core": null,
"tasks_per_node": 8,
"tasks_per_socket": null,
"tasks_per_board": 0,
"cpus": 8,
"node_count": 1,
"tasks": 8,
"het_job_id": 0,
"het_job_id_set": "",
"het_job_offset": 0,
"partition": "atlas",
"prefer": "",
"memory_per_node": null,
"memory_per_cpu": 2000,
"minimum_cpus_per_node": 8,
"minimum_tmp_disk_per_node": 0,
"preempt_time": 0,
"pre_sus_time": 0,
"priority": 4294877957,
"profile": null,
"qos": "",
"reboot": false,
"required_nodes": "",
"requeue": false,
"resize_time": 0,
"restart_cnt": 0,
"resv_name": "",
"shared": null,
"show_flags": [
"SHOW_ALL",
"SHOW_DETAIL",
"SHOW_LOCAL"
],
"sockets_per_board": 0,
"sockets_per_node": null,
"start_time": 1723457333,
"state_description": "",
"state_reason": "None",
"standard_error": "\/home\/sessiondir\/nN8KDmNMPx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmeIKKDml0xJjm.comment",
"standard_input": "\/dev\/null",
"standard_output": "\/home\/sessiondir\/nN8KDmNMPx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmeIKKDml0xJjm.comment",
"submit_time": 1723457333,
"suspend_time": 0,
"system_comment": "",
"time_limit": 3600,
"time_minimum": 0,
"threads_per_core": null,
"tres_bind": "",
"tres_freq": "",
"tres_per_job": "",
"tres_per_node": "",
"tres_per_socket": "",
"tres_per_task": "",
"tres_req_str": "cpu=8,mem=16000M,node=1,billing=8",
"tres_alloc_str": "cpu=8,mem=16000M,node=1,billing=8",
"user_id": 2006,
"user_name": "atl001",
"wckey": "",
"current_working_directory": "\/home\/sessiondir\/nN8KDmNMPx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmeIKKDml0xJjm"
}
]
}
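
The tres_req_str and tres_alloc_str fields above pack the requested and allocated trackable resources into a single comma-separated string (for example cpu=2,mem=4000M,node=1,billing=2). A minimal Go sketch of how such a string could be split into key/value pairs; the parseTres helper is illustrative only and is not part of the plugin.

package main

import (
	"fmt"
	"strings"
)

// parseTres splits a SLURM TRES string such as
// "cpu=2,mem=4000M,node=1,billing=2" into a map of
// resource name -> raw value. Purely illustrative.
func parseTres(s string) map[string]string {
	out := make(map[string]string)
	for _, pair := range strings.Split(s, ",") {
		kv := strings.SplitN(pair, "=", 2)
		if len(kv) != 2 {
			continue
		}
		out[kv[0]] = kv[1]
	}
	return out
}

func main() {
	fmt.Println(parseTres("cpu=2,mem=4000M,node=1,billing=2"))
}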

View File

@ -0,0 +1,175 @@
{
"meta": {
"plugin": {
"type": "openapi\/v0.0.38",
"name": "Slurm OpenAPI v0.0.38"
},
"Slurm": {
"version": {
"major": 22,
"micro": 9,
"minor": 5
},
"release": "22.05.9"
}
},
"errors": [
],
"nodes": [
{
"architecture": "x86_64",
"burstbuffer_network_address": "",
"boards": 1,
"boot_time": 1719400973,
"comment": "",
"cores": 18,
"cpu_binding": 0,
"cpu_load": 27,
"extra": "",
"free_memory": 86423,
"cpus": 36,
"last_busy": 1723102876,
"features": "",
"active_features": "",
"gres": "",
"gres_drained": "N\/A",
"gres_used": "",
"mcs_label": "",
"name": "naboo145",
"next_state_after_reboot": "invalid",
"address": "naboo145",
"hostname": "naboo145",
"state": "idle",
"state_flags": [
"DRAIN"
],
"next_state_after_reboot_flags": [
],
"operating_system": "Linux 5.14.0-427.13.1.el9_4.x86_64 #1 SMP PREEMPT_DYNAMIC Tue Apr 30 18:22:29 EDT 2024",
"owner": null,
"partitions": [
"atlas"
],
"port": 6818,
"real_memory": 94791,
"reason": "Kill task failed",
"reason_changed_at": 1723077306,
"reason_set_by_user": "root",
"slurmd_start_time": 1720394759,
"sockets": 2,
"threads": 1,
"temporary_disk": 0,
"weight": 1,
"tres": "cpu=36,mem=94791M,billing=36",
"slurmd_version": "22.05.9",
"alloc_memory": 0,
"alloc_cpus": 0,
"idle_cpus": 36,
"tres_used": null,
"tres_weighted": 0.0
},
{
"architecture": "x86_64",
"burstbuffer_network_address": "",
"boards": 1,
"boot_time": 1719400759,
"comment": "",
"cores": 18,
"cpu_binding": 0,
"cpu_load": 0,
"extra": "",
"free_memory": 92151,
"cpus": 36,
"last_busy": 1722780995,
"features": "",
"active_features": "",
"gres": "",
"gres_drained": "N\/A",
"gres_used": "",
"mcs_label": "",
"name": "naboo146",
"next_state_after_reboot": "invalid",
"address": "naboo146",
"hostname": "naboo146",
"state": "idle",
"state_flags": [
"DRAIN"
],
"next_state_after_reboot_flags": [
],
"operating_system": "Linux 5.14.0-427.13.1.el9_4.x86_64 #1 SMP PREEMPT_DYNAMIC Tue Apr 30 18:22:29 EDT 2024",
"owner": null,
"partitions": [
"atlas"
],
"port": 6818,
"real_memory": 94791,
"reason": "Kill task failed",
"reason_changed_at": 1722748927,
"reason_set_by_user": "root",
"slurmd_start_time": 1720394759,
"sockets": 2,
"threads": 1,
"temporary_disk": 0,
"weight": 1,
"tres": "cpu=36,mem=94791M,billing=36",
"slurmd_version": "22.05.9",
"alloc_memory": 0,
"alloc_cpus": 0,
"idle_cpus": 36,
"tres_used": null,
"tres_weighted": 0.0
},
{
"architecture": "x86_64",
"burstbuffer_network_address": "",
"boards": 1,
"boot_time": 1719406605,
"comment": "",
"cores": 18,
"cpu_binding": 0,
"cpu_load": 2969,
"extra": "",
"free_memory": 10908,
"cpus": 36,
"last_busy": 1722881704,
"features": "",
"active_features": "",
"gres": "",
"gres_drained": "N\/A",
"gres_used": "",
"mcs_label": "",
"name": "naboo147",
"next_state_after_reboot": "invalid",
"address": "naboo147",
"hostname": "naboo147",
"state": "allocated",
"state_flags": [
],
"next_state_after_reboot_flags": [
],
"operating_system": "Linux 5.14.0-427.13.1.el9_4.x86_64 #1 SMP PREEMPT_DYNAMIC Tue Apr 30 18:22:29 EDT 2024",
"owner": null,
"partitions": [
"atlas"
],
"port": 6818,
"real_memory": 94793,
"reason": "",
"reason_changed_at": 0,
"reason_set_by_user": null,
"slurmd_start_time": 1720394759,
"sockets": 2,
"threads": 1,
"temporary_disk": 0,
"weight": 1,
"tres": "cpu=36,mem=94793M,billing=36",
"slurmd_version": "22.05.9",
"alloc_memory": 56000,
"alloc_cpus": 36,
"idle_cpus": 0,
"tres_used": "cpu=36,mem=56000M",
"tres_weighted": 36.0
}
]
}
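
Each node entry reports totals (cpus, real_memory) alongside allocation counters (alloc_cpus, alloc_memory, idle_cpus), so derived figures such as a CPU allocation ratio fall out directly. A hedged sketch under that assumption; the node struct below mirrors only the fields used here and is not the plugin's actual type.

package main

import "fmt"

// node carries only the fields needed for this example; the real
// response contains many more (see the JSON above).
type node struct {
	Name      string
	CPUs      int
	AllocCPUs int
}

// allocRatio returns the fraction of a node's CPUs that are allocated.
func allocRatio(n node) float64 {
	if n.CPUs == 0 {
		return 0
	}
	return float64(n.AllocCPUs) / float64(n.CPUs)
}

func main() {
	nodes := []node{
		{Name: "naboo145", CPUs: 36, AllocCPUs: 0},
		{Name: "naboo147", CPUs: 36, AllocCPUs: 36},
	}
	for _, n := range nodes {
		fmt.Printf("%s: %.2f\n", n.Name, allocRatio(n))
	}
}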

View File

@ -0,0 +1,56 @@
{
"meta": {
"plugin": {
"type": "openapi\/v0.0.38",
"name": "Slurm OpenAPI v0.0.38"
},
"Slurm": {
"version": {
"major": 22,
"micro": 9,
"minor": 5
},
"release": "22.05.9"
}
},
"errors": [
],
"partitions": [
{
"flags": [
"default"
],
"preemption_mode": [
"disabled"
],
"allowed_allocation_nodes": "",
"allowed_accounts": "",
"allowed_groups": "",
"allowed_qos": "",
"alternative": "",
"billing_weights": "",
"default_memory_per_cpu": null,
"default_memory_per_node": null,
"default_time_limit": null,
"denied_accounts": "",
"denied_qos": "",
"preemption_grace_time": 0,
"maximum_cpus_per_node": -1,
"maximum_memory_per_cpu": null,
"maximum_memory_per_node": null,
"maximum_nodes_per_job": -1,
"max_time_limit": -1,
"min nodes per job": 0,
"name": "atlas",
"nodes": "naboo145,naboo146,naboo147,naboo216,naboo219,naboo222,naboo224,naboo225,naboo227,naboo228,naboo229,naboo234,naboo235,naboo236,naboo237,naboo238,naboo239,naboo240,naboo241,naboo242,naboo243",
"over_time_limit": null,
"priority_job_factor": 1,
"priority_tier": 1,
"qos": "",
"state": "UP",
"total_cpus": 632,
"total_nodes": 21,
"tres": "cpu=632,mem=1415207M,node=21,billing=632"
}
]
}
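
The partition's nodes field lists its members in expanded, comma-separated form, which should agree with total_nodes (21 here). A small sketch of that cross-check, assuming the expanded form shown above; compressed host ranges such as naboo[145-147] would need dedicated expansion and are not handled.

package main

import (
	"fmt"
	"strings"
)

// countNodes counts entries in an expanded, comma-separated node list
// such as the "nodes" field of the partition above.
func countNodes(list string) int {
	if list == "" {
		return 0
	}
	return len(strings.Split(list, ","))
}

func main() {
	fmt.Println(countNodes("naboo145,naboo146,naboo147")) // 3
}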

View File

@ -0,0 +1,20 @@
{
"meta": {
"plugin": {
"type": "openapi\/v0.0.38",
"name": "Slurm OpenAPI v0.0.38"
},
"Slurm": {
"version": {
"major": 22,
"micro": 9,
"minor": 5
},
"release": "22.05.9"
}
},
"errors": [
],
"reservations": [
]
}

View File

@ -0,0 +1,8 @@
[[inputs.slurm]]
url = "willBeOverriden"
response_timeout = "5s"
# enabled_endpoints = []
## Credentials for JWT-based authentication
username = "root"
token = "topSecret"
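
The url value is a placeholder ("willBeOverriden"), which suggests the tests point the plugin at a local HTTP server that replays the JSON fixtures above. A minimal sketch of that pattern with net/http/httptest; the handler and canned body are assumptions, not the plugin's actual test code.

package main

import (
	"fmt"
	"net/http"
	"net/http/httptest"
)

func main() {
	// Serve a canned JSON body, standing in for slurmrestd.
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		fmt.Fprint(w, `{"meta": {}, "errors": [], "jobs": []}`)
	}))
	defer srv.Close()

	// srv.URL is the address that would replace the placeholder url.
	fmt.Println(srv.URL)
}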

View File

@ -0,0 +1,5 @@
{
"meta": {},
"errors": [],
"statistics": {}
}

View File

@ -0,0 +1,5 @@
{
"meta": {},
"errors": [],
"jobs": []
}

View File

@ -0,0 +1,5 @@
{
"meta": {},
"errors": [],
"nodes": []
}

View File

@ -0,0 +1,5 @@
{
"meta": {},
"errors": [],
"partitions": []
}

View File

@ -0,0 +1,5 @@
{
"meta": {},
"errors": [],
"reservations": []
}
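
These empty payloads presumably exercise the case of a cluster with nothing to report for jobs, nodes, partitions, or reservations. A small sketch of decoding one of them, assuming a struct that mirrors only the top-level jobs key; the jobsResp type is illustrative, not the plugin's.

package main

import (
	"encoding/json"
	"fmt"
)

// jobsResp models only the "jobs" key of the fixture above.
type jobsResp struct {
	Jobs []json.RawMessage `json:"jobs"`
}

func main() {
	raw := []byte(`{"meta": {}, "errors": [], "jobs": []}`)
	var r jobsResp
	if err := json.Unmarshal(raw, &r); err != nil {
		panic(err)
	}
	fmt.Println("jobs:", len(r.Jobs)) // jobs: 0
}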

View File

@ -0,0 +1,8 @@
[[inputs.slurm]]
url = "willBeOverriden"
response_timeout = "5s"
enabled_endpoints = []
## Credentials for JWT-based authentication
username = "root"
token = "topSecret"