mnfst · TF0rd · Jun 6, 2026
diff --git a/models/nvidia/gliner-pii.yaml b/models/nvidia/gliner-pii.yaml
@@ -0,0 +1,43 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: gliner-pii
+params:  # Each entry declares one parameter: path, type, label, description, optional default/range, and group.
+  - path: labels
+    type: string  # NOTE(review): description reads as a set of entity types; confirm a single string is the intended type.
+    label: Labels
+    description: Entity types to detect. If not specified, uses the default set of 55 PII categories including email, phone_number, ssn, first_name, last_name, and address.
+    group: provider_metadata
+  - path: threshold
+    type: number
+    label: Threshold
+    description: Confidence threshold for entity detection. Lower values detect more entities but may include false positives.
+    default: 0.5
+    range:
+      min: 0
+      max: 1
+    group: sampling
+  - path: chunk_length
+    type: integer
+    label: Chunk length
+    description: Context window size for processing. Longer texts are automatically split into chunks with overlap for complete coverage. Must be greater than overlap.
+    default: 384  # Larger than the overlap default (128), so the defaults satisfy the ordering rule.
+    range:
+      min: 1  # The range alone cannot enforce "greater than overlap"; overlap may be as high as 512.
+      max: 2048
+    group: provider_metadata
+  - path: overlap
+    type: integer
+    label: Overlap
+    description: Token overlap between chunks to prevent entity clipping. Must be less than chunk_length.
+    default: 128
+    range:
+      min: 0
+      max: 512  # Values at or above the chosen chunk_length pass this range but break the rule in the description.
+    group: provider_metadata
+  - path: flat_ner
+    type: boolean
+    label: Flat NER
+    description: When true, prevents overlapping entity spans. When false, may return nested entities such as both a full name and its constituent first name.
+    default: false  # Per the description, nested entity spans may be returned by default.
+    group: provider_metadata
diff --git a/models/nvidia/llama-3.1-nemoguard-8b-topic-control.yaml b/models/nvidia/llama-3.1-nemoguard-8b-topic-control.yaml
@@ -0,0 +1,53 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: llama-3.1-nemoguard-8b-topic-control
+params:  # Each entry declares one parameter: path, type, label, description, optional default/range, and group.
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.5
+    range:
+      min: 0
+      max: 2
+    group: sampling
+  - path: top_p
+    type: number
+    label: Top P
+    description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.
+    default: 1  # Equal to the declared maximum.
+    range:  # Only an upper bound is declared. NOTE(review): confirm whether a lower bound applies.
+      max: 1
+    group: sampling
+  - path: max_tokens
+    type: integer
+    label: Max tokens
+    description: Maximum number of tokens to generate. Generation stops when this limit is reached.
+    default: 1024
+    range:  # No upper bound is declared for this model.
+      min: 1
+    group: generation_length
+  - path: frequency_penalty
+    type: number
+    label: Frequency penalty
+    description: Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
+    default: 0  # Zero applies no penalty in either direction.
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: presence_penalty
+    type: number
+    label: Presence penalty
+    description: Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
+    default: 0  # Zero applies no penalty in either direction.
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: stop
+    type: string  # NOTE(review): description allows "a string or list of strings"; confirm the list form is intentionally not modeled.
+    label: Stop
+    description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+    group: generation_length
diff --git a/models/nvidia/llama-3.1-nemotron-nano-8b-v1.yaml b/models/nvidia/llama-3.1-nemotron-nano-8b-v1.yaml
@@ -0,0 +1,63 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: llama-3.1-nemotron-nano-8b-v1
+params:  # Each entry declares one parameter: path, type, label, description, optional default/range, and group.
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.6
+    range:
+      min: 0
+      max: 1
+    group: sampling
+  - path: top_p
+    type: number
+    label: Top P
+    description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.95
+    range:  # Only an upper bound is declared. NOTE(review): confirm whether a lower bound applies.
+      max: 1
+    group: sampling
+  - path: max_tokens
+    type: integer
+    label: Max tokens
+    description: Maximum number of tokens to generate. Generation stops when this limit is reached.
+    default: 4096
+    range:
+      min: 1
+      max: 16384
+    group: generation_length
+  - path: frequency_penalty
+    type: number
+    label: Frequency penalty
+    description: Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
+    default: 0  # Zero applies no penalty in either direction.
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: presence_penalty
+    type: number
+    label: Presence penalty
+    description: Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
+    default: 0  # Zero applies no penalty in either direction.
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: seed
+    type: integer
+    label: Seed
+    description: Best-effort deterministic sampling seed. Changing the seed produces a different response with similar characteristics. Fix the seed to reproduce results.
+    default: 0
+    range:
+      min: 0
+      max: 18446744073709551615  # 2^64 - 1 (largest unsigned 64-bit integer), written exactly instead of the float-rounded 18446744073709552000.
+    group: sampling
+  - path: stop
+    type: string  # NOTE(review): description allows "a string or list of strings"; confirm the list form is intentionally not modeled.
+    label: Stop
+    description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+    group: generation_length
diff --git a/models/nvidia/llama-3.1-nemotron-safety-guard-8b-v3.yaml b/models/nvidia/llama-3.1-nemotron-safety-guard-8b-v3.yaml
@@ -0,0 +1,14 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: llama-3.1-nemotron-safety-guard-8b-v3
+params:  # Only one parameter is declared for this model.
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied.
+    default: 0  # Lowest value in the declared range, i.e. the most focused setting per the description.
+    range:
+      min: 0
+      max: 1
+    group: sampling
diff --git a/models/nvidia/llama-3.1-nemotron-ultra-253b-v1.yaml b/models/nvidia/llama-3.1-nemotron-ultra-253b-v1.yaml
@@ -0,0 +1,63 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: llama-3.1-nemotron-ultra-253b-v1
+params:  # Each entry declares one parameter: path, type, label, description, optional default/range, and group.
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.6
+    range:
+      min: 0
+      max: 1
+    group: sampling
+  - path: top_p
+    type: number
+    label: Top P
+    description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.95
+    range:  # Only an upper bound is declared. NOTE(review): confirm whether a lower bound applies.
+      max: 1
+    group: sampling
+  - path: max_tokens
+    type: integer
+    label: Max tokens
+    description: Maximum number of tokens to generate. Generation stops when this limit is reached.
+    default: 4096
+    range:
+      min: 1
+      max: 16384
+    group: generation_length
+  - path: frequency_penalty
+    type: number
+    label: Frequency penalty
+    description: Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
+    default: 0  # Zero applies no penalty in either direction.
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: presence_penalty
+    type: number
+    label: Presence penalty
+    description: Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
+    default: 0  # Zero applies no penalty in either direction.
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: seed
+    type: integer
+    label: Seed
+    description: Best-effort deterministic sampling seed. Changing the seed produces a different response with similar characteristics. Fix the seed to reproduce results.
+    default: 0
+    range:
+      min: 0
+      max: 18446744073709551615  # 2^64 - 1 (largest unsigned 64-bit integer), written exactly instead of the float-rounded 18446744073709552000.
+    group: sampling
+  - path: stop
+    type: string  # NOTE(review): description allows "a string or list of strings"; confirm the list form is intentionally not modeled.
+    label: Stop
+    description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+    group: generation_length
diff --git a/models/nvidia/llama-3.3-nemotron-super-49b-v1.5.yaml b/models/nvidia/llama-3.3-nemotron-super-49b-v1.5.yaml
@@ -0,0 +1,63 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: llama-3.3-nemotron-super-49b-v1.5
+params:  # Each entry declares one parameter: path, type, label, description, optional default/range, and group.
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.6
+    range:
+      min: 0
+      max: 1
+    group: sampling
+  - path: top_p
+    type: number
+    label: Top P
+    description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.95
+    range:  # Only an upper bound is declared. NOTE(review): confirm whether a lower bound applies.
+      max: 1
+    group: sampling
+  - path: max_tokens
+    type: integer
+    label: Max tokens
+    description: Maximum number of tokens to generate. Generation stops when this limit is reached.
+    default: 65536  # Equal to the declared maximum. NOTE(review): confirm this default is intended.
+    range:
+      min: 1
+      max: 65536
+    group: generation_length
+  - path: frequency_penalty
+    type: number
+    label: Frequency penalty
+    description: Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
+    default: 0  # Zero applies no penalty in either direction.
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: presence_penalty
+    type: number
+    label: Presence penalty
+    description: Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
+    default: 0  # Zero applies no penalty in either direction.
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: seed
+    type: integer
+    label: Seed
+    description: Best-effort deterministic sampling seed. Changing the seed produces a different response with similar characteristics. Fix the seed to reproduce results.
+    default: 0
+    range:
+      min: 0
+      max: 18446744073709551615  # 2^64 - 1 (largest unsigned 64-bit integer), written exactly instead of the float-rounded 18446744073709552000.
+    group: sampling
+  - path: stop
+    type: string  # NOTE(review): description allows "a string or list of strings"; confirm the list form is intentionally not modeled.
+    label: Stop
+    description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+    group: generation_length
diff --git a/models/nvidia/llama-3.3-nemotron-super-49b-v1.yaml b/models/nvidia/llama-3.3-nemotron-super-49b-v1.yaml
@@ -0,0 +1,63 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: llama-3.3-nemotron-super-49b-v1
+params:  # Each entry declares one parameter: path, type, label, description, optional default/range, and group.
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.6
+    range:
+      min: 0
+      max: 1
+    group: sampling
+  - path: top_p
+    type: number
+    label: Top P
+    description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.95
+    range:  # Only an upper bound is declared. NOTE(review): confirm whether a lower bound applies.
+      max: 1
+    group: sampling
+  - path: max_tokens
+    type: integer
+    label: Max tokens
+    description: Maximum number of tokens to generate. Generation stops when this limit is reached.
+    default: 4096
+    range:
+      min: 1
+      max: 16384
+    group: generation_length
+  - path: frequency_penalty
+    type: number
+    label: Frequency penalty
+    description: Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
+    default: 0  # Zero applies no penalty in either direction.
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: presence_penalty
+    type: number
+    label: Presence penalty
+    description: Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
+    default: 0  # Zero applies no penalty in either direction.
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: seed
+    type: integer
+    label: Seed
+    description: Best-effort deterministic sampling seed. Changing the seed produces a different response with similar characteristics. Fix the seed to reproduce results.
+    default: 0
+    range:
+      min: 0
+      max: 18446744073709551615  # 2^64 - 1 (largest unsigned 64-bit integer), written exactly instead of the float-rounded 18446744073709552000.
+    group: sampling
+  - path: stop
+    type: string  # NOTE(review): description allows "a string or list of strings"; confirm the list form is intentionally not modeled.
+    label: Stop
+    description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+    group: generation_length
diff --git a/models/nvidia/nemoguard-jailbreak-detect.yaml b/models/nvidia/nemoguard-jailbreak-detect.yaml
@@ -0,0 +1,10 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: nemoguard-jailbreak-detect
+params:  # Only one parameter is declared for this model; it has no default or range.
+  - path: input
+    type: string  # NOTE(review): description also accepts an array of strings; confirm the array form is intentionally not modeled.
+    label: Input
+    description: The text to classify for jailbreak attempts. Accepts a string or an array of strings.
+    group: provider_metadata