diff --git a/models/nvidia/gliner-pii.yaml b/models/nvidia/gliner-pii.yaml
new file mode 100644
index 0000000..9bbb542
--- /dev/null
+++ b/models/nvidia/gliner-pii.yaml
@@ -0,0 +1,43 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: gliner-pii
+params:
+  - path: labels
+    type: string
+    label: Labels
+    description: Entity types to detect. If not specified, uses the default set of 55 PII categories including email, phone_number, ssn, first_name, last_name, and address.
+    group: provider_metadata
+  - path: threshold
+    type: number
+    label: Threshold
+    description: Confidence threshold for entity detection. Lower values detect more entities but may include false positives.
+    default: 0.5
+    range:
+      min: 0
+      max: 1
+    group: provider_metadata  # detection cut-off, not a token-sampling control
+  - path: chunk_length
+    type: integer
+    label: Chunk length
+    description: Context window size for processing. Longer texts are automatically split into chunks with overlap for complete coverage. Must be greater than overlap.
+    default: 384
+    range:
+      min: 1
+      max: 2048
+    group: provider_metadata
+  - path: overlap
+    type: integer
+    label: Overlap
+    description: Token overlap between chunks to prevent entity clipping. Must be less than chunk_length.
+    default: 128
+    range:
+      min: 0
+      max: 512
+    group: provider_metadata
+  - path: flat_ner
+    type: boolean
+    label: Flat NER
+    description: When true, prevents overlapping entity spans. When false, may return nested entities such as both a full name and its constituent first name.
+    default: false
+    group: provider_metadata
diff --git a/models/nvidia/llama-3.1-nemoguard-8b-topic-control.yaml b/models/nvidia/llama-3.1-nemoguard-8b-topic-control.yaml
new file mode 100644
index 0000000..bfc0d61
--- /dev/null
+++ b/models/nvidia/llama-3.1-nemoguard-8b-topic-control.yaml
@@ -0,0 +1,53 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: llama-3.1-nemoguard-8b-topic-control
+params:
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.5
+    range:
+      min: 0
+      max: 2
+    group: sampling
+  - path: top_p
+    type: number
+    label: Top P
+    description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.
+    default: 1
+    range:
+      max: 1
+    group: sampling
+  - path: max_tokens
+    type: integer
+    label: Max tokens
+    description: Maximum number of tokens to generate. Generation stops when this limit is reached.
+    default: 1024
+    range:
+      min: 1
+    group: generation_length
+  - path: frequency_penalty
+    type: number
+    label: Frequency penalty
+    description: Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
+    default: 0
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: presence_penalty
+    type: number
+    label: Presence penalty
+    description: Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
+    default: 0
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: stop
+    type: string
+    label: Stop
+    description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+    group: generation_length
diff --git a/models/nvidia/llama-3.1-nemotron-nano-8b-v1.yaml b/models/nvidia/llama-3.1-nemotron-nano-8b-v1.yaml
new file mode 100644
index 0000000..fc46239
--- /dev/null
+++ b/models/nvidia/llama-3.1-nemotron-nano-8b-v1.yaml
@@ -0,0 +1,63 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: llama-3.1-nemotron-nano-8b-v1
+params:
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.6
+    range:
+      min: 0
+      max: 1
+    group: sampling
+  - path: top_p
+    type: number
+    label: Top P
+    description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.95
+    range:
+      max: 1
+    group: sampling
+  - path: max_tokens
+    type: integer
+    label: Max tokens
+    description: Maximum number of tokens to generate. Generation stops when this limit is reached.
+    default: 4096
+    range:
+      min: 1
+      max: 16384
+    group: generation_length
+  - path: frequency_penalty
+    type: number
+    label: Frequency penalty
+    description: Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
+    default: 0
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: presence_penalty
+    type: number
+    label: Presence penalty
+    description: Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
+    default: 0
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: seed
+    type: integer
+    label: Seed
+    description: Best-effort deterministic sampling seed. Changing the seed produces a different response with similar characteristics. Fix the seed to reproduce results.
+    default: 0
+    range:
+      min: 0
+      max: 18446744073709551615  # 2^64 - 1, the largest unsigned 64-bit integer
+    group: sampling
+  - path: stop
+    type: string
+    label: Stop
+    description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+    group: generation_length
diff --git a/models/nvidia/llama-3.1-nemotron-safety-guard-8b-v3.yaml b/models/nvidia/llama-3.1-nemotron-safety-guard-8b-v3.yaml
new file mode 100644
index 0000000..4256093
--- /dev/null
+++ b/models/nvidia/llama-3.1-nemotron-safety-guard-8b-v3.yaml
@@ -0,0 +1,14 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: llama-3.1-nemotron-safety-guard-8b-v3
+params:
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied.
+    default: 0
+    range:
+      min: 0
+      max: 1
+    group: sampling
diff --git a/models/nvidia/llama-3.1-nemotron-ultra-253b-v1.yaml b/models/nvidia/llama-3.1-nemotron-ultra-253b-v1.yaml
new file mode 100644
index 0000000..367b0ef
--- /dev/null
+++ b/models/nvidia/llama-3.1-nemotron-ultra-253b-v1.yaml
@@ -0,0 +1,63 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: llama-3.1-nemotron-ultra-253b-v1
+params:
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.6
+    range:
+      min: 0
+      max: 1
+    group: sampling
+  - path: top_p
+    type: number
+    label: Top P
+    description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.95
+    range:
+      max: 1
+    group: sampling
+  - path: max_tokens
+    type: integer
+    label: Max tokens
+    description: Maximum number of tokens to generate. Generation stops when this limit is reached.
+    default: 4096
+    range:
+      min: 1
+      max: 16384
+    group: generation_length
+  - path: frequency_penalty
+    type: number
+    label: Frequency penalty
+    description: Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
+    default: 0
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: presence_penalty
+    type: number
+    label: Presence penalty
+    description: Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
+    default: 0
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: seed
+    type: integer
+    label: Seed
+    description: Best-effort deterministic sampling seed. Changing the seed produces a different response with similar characteristics. Fix the seed to reproduce results.
+    default: 0
+    range:
+      min: 0
+      max: 18446744073709551615  # 2^64 - 1, the largest unsigned 64-bit integer
+    group: sampling
+  - path: stop
+    type: string
+    label: Stop
+    description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+    group: generation_length
diff --git a/models/nvidia/llama-3.3-nemotron-super-49b-v1.5.yaml b/models/nvidia/llama-3.3-nemotron-super-49b-v1.5.yaml
new file mode 100644
index 0000000..b20d19b
--- /dev/null
+++ b/models/nvidia/llama-3.3-nemotron-super-49b-v1.5.yaml
@@ -0,0 +1,63 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: llama-3.3-nemotron-super-49b-v1.5
+params:
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.6
+    range:
+      min: 0
+      max: 1
+    group: sampling
+  - path: top_p
+    type: number
+    label: Top P
+    description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.95
+    range:
+      max: 1
+    group: sampling
+  - path: max_tokens
+    type: integer
+    label: Max tokens
+    description: Maximum number of tokens to generate. Generation stops when this limit is reached.
+    default: 65536
+    range:
+      min: 1
+      max: 65536
+    group: generation_length
+  - path: frequency_penalty
+    type: number
+    label: Frequency penalty
+    description: Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
+    default: 0
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: presence_penalty
+    type: number
+    label: Presence penalty
+    description: Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
+    default: 0
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: seed
+    type: integer
+    label: Seed
+    description: Best-effort deterministic sampling seed. Changing the seed produces a different response with similar characteristics. Fix the seed to reproduce results.
+    default: 0
+    range:
+      min: 0
+      max: 18446744073709551615  # 2^64 - 1, the largest unsigned 64-bit integer
+    group: sampling
+  - path: stop
+    type: string
+    label: Stop
+    description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+    group: generation_length
diff --git a/models/nvidia/llama-3.3-nemotron-super-49b-v1.yaml b/models/nvidia/llama-3.3-nemotron-super-49b-v1.yaml
new file mode 100644
index 0000000..313d97e
--- /dev/null
+++ b/models/nvidia/llama-3.3-nemotron-super-49b-v1.yaml
@@ -0,0 +1,63 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: llama-3.3-nemotron-super-49b-v1
+params:
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.6
+    range:
+      min: 0
+      max: 1
+    group: sampling
+  - path: top_p
+    type: number
+    label: Top P
+    description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.95
+    range:
+      max: 1
+    group: sampling
+  - path: max_tokens
+    type: integer
+    label: Max tokens
+    description: Maximum number of tokens to generate. Generation stops when this limit is reached.
+    default: 4096
+    range:
+      min: 1
+      max: 16384
+    group: generation_length
+  - path: frequency_penalty
+    type: number
+    label: Frequency penalty
+    description: Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
+    default: 0
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: presence_penalty
+    type: number
+    label: Presence penalty
+    description: Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
+    default: 0
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: seed
+    type: integer
+    label: Seed
+    description: Best-effort deterministic sampling seed. Changing the seed produces a different response with similar characteristics. Fix the seed to reproduce results.
+    default: 0
+    range:
+      min: 0
+      max: 18446744073709551615  # 2^64 - 1, the largest unsigned 64-bit integer
+    group: sampling
+  - path: stop
+    type: string
+    label: Stop
+    description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+    group: generation_length
diff --git a/models/nvidia/nemoguard-jailbreak-detect.yaml b/models/nvidia/nemoguard-jailbreak-detect.yaml
new file mode 100644
index 0000000..1800f49
--- /dev/null
+++ b/models/nvidia/nemoguard-jailbreak-detect.yaml
@@ -0,0 +1,10 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: nemoguard-jailbreak-detect
+params:
+  - path: input
+    type: string
+    label: Input
+    description: The text to classify for jailbreak attempts. Accepts a string or an array of strings.
+    group: provider_metadata
diff --git a/models/nvidia/nemotron-3-nano-30b-a3b.yaml b/models/nvidia/nemotron-3-nano-30b-a3b.yaml
new file mode 100644
index 0000000..aaf9bcc
--- /dev/null
+++ b/models/nvidia/nemotron-3-nano-30b-a3b.yaml
@@ -0,0 +1,43 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: nemotron-3-nano-30b-a3b
+params:
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.
+    default: 1
+    range:
+      max: 1
+    group: sampling
+  - path: top_p
+    type: number
+    label: Top P
+    description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.
+    default: 1
+    range:
+      max: 1
+    group: sampling
+  - path: max_tokens
+    type: integer
+    label: Max tokens
+    description: Maximum number of tokens to generate. Generation stops when this limit is reached.
+    default: 16384
+    range:
+      min: 1
+      max: 32768
+    group: generation_length
+  - path: seed
+    type: integer
+    label: Seed
+    description: Best-effort deterministic sampling seed. Repeated requests with the same seed and parameters should return the same result.
+    range:
+      min: 0
+      max: 18446744073709551615  # 2^64 - 1, the largest unsigned 64-bit integer
+    group: sampling
+  - path: stop
+    type: string
+    label: Stop
+    description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+    group: generation_length
diff --git a/models/nvidia/nemotron-3-super-120b-a12b.yaml b/models/nvidia/nemotron-3-super-120b-a12b.yaml
new file mode 100644
index 0000000..1a81de2
--- /dev/null
+++ b/models/nvidia/nemotron-3-super-120b-a12b.yaml
@@ -0,0 +1,62 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: nemotron-3-super-120b-a12b
+params:
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.
+    default: 1
+    range:
+      max: 1
+    group: sampling
+  - path: top_p
+    type: number
+    label: Top P
+    description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.95
+    range:
+      max: 1
+    group: sampling
+  - path: max_tokens
+    type: integer
+    label: Max tokens
+    description: Maximum number of tokens to generate. Generation stops when this limit is reached.
+    default: 16384
+    range:
+      min: 1
+      max: 32768
+    group: generation_length
+  - path: reasoning_effort
+    type: enum
+    label: Reasoning effort
+    description: Controls the reasoning mode. 'none' disables reasoning tokens, 'low' enables low-effort reasoning, and 'high' enables full reasoning.
+    default: high
+    values:
+      - none
+      - low
+      - high
+    group: reasoning
+  - path: reasoning_budget
+    type: integer
+    label: Reasoning budget
+    description: Maximum number of tokens the model may use for internal reasoning before being forced to end the reasoning trace. Use -1 to disable budget enforcement.
+    default: 16384
+    range:
+      min: -1
+      max: 32768
+    group: reasoning
+  - path: seed
+    type: integer
+    label: Seed
+    description: Best-effort deterministic sampling seed. Repeated requests with the same seed and parameters should return the same result.
+    range:
+      min: 0
+      max: 18446744073709551615  # 2^64 - 1, the largest unsigned 64-bit integer
+    group: sampling
+  - path: stop
+    type: string
+    label: Stop
+    description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+    group: generation_length
diff --git a/models/nvidia/nemotron-3-ultra-550b-a55b.yaml b/models/nvidia/nemotron-3-ultra-550b-a55b.yaml
new file mode 100644
index 0000000..b7c2fef
--- /dev/null
+++ b/models/nvidia/nemotron-3-ultra-550b-a55b.yaml
@@ -0,0 +1,62 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: nemotron-3-ultra-550b-a55b
+params:
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.
+    default: 1
+    range:
+      max: 1
+    group: sampling
+  - path: top_p
+    type: number
+    label: Top P
+    description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.95
+    range:
+      max: 1
+    group: sampling
+  - path: max_tokens
+    type: integer
+    label: Max tokens
+    description: Maximum number of tokens to generate. Generation stops when this limit is reached.
+    default: 16384
+    range:
+      min: 1
+      max: 32768
+    group: generation_length
+  - path: reasoning_effort
+    type: enum
+    label: Reasoning effort
+    description: Controls the reasoning mode. 'none' disables reasoning tokens, 'medium' enables efficient reasoning, and 'high' enables full reasoning.
+    default: high
+    values:
+      - none
+      - medium
+      - high
+    group: reasoning
+  - path: reasoning_budget
+    type: integer
+    label: Reasoning budget
+    description: Maximum number of tokens the model may use for internal reasoning before being forced to end the reasoning trace. Use -1 to disable budget enforcement.
+    default: 16384
+    range:
+      min: -1
+      max: 32768
+    group: reasoning
+  - path: seed
+    type: integer
+    label: Seed
+    description: Best-effort deterministic sampling seed. Repeated requests with the same seed and parameters should return the same result.
+    range:
+      min: 0
+      max: 18446744073709551615  # 2^64 - 1, the largest unsigned 64-bit integer
+    group: sampling
+  - path: stop
+    type: string
+    label: Stop
+    description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+    group: generation_length
diff --git a/models/nvidia/nemotron-content-safety-reasoning-4b.yaml b/models/nvidia/nemotron-content-safety-reasoning-4b.yaml
new file mode 100644
index 0000000..7bfa8bb
--- /dev/null
+++ b/models/nvidia/nemotron-content-safety-reasoning-4b.yaml
@@ -0,0 +1,43 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: nemotron-content-safety-reasoning-4b
+params:
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.
+    default: 1
+    range:
+      max: 1
+    group: sampling
+  - path: top_p
+    type: number
+    label: Top P
+    description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.
+    default: 1
+    range:
+      max: 1
+    group: sampling
+  - path: max_tokens
+    type: integer
+    label: Max tokens
+    description: Maximum number of tokens to generate. Generation stops when this limit is reached.
+    default: 16384
+    range:
+      min: 1
+      max: 32768
+    group: generation_length
+  - path: seed
+    type: integer
+    label: Seed
+    description: Best-effort deterministic sampling seed. Repeated requests with the same seed and parameters should return the same result.
+    range:
+      min: 0
+      max: 18446744073709551615  # 2^64 - 1, the largest unsigned 64-bit integer
+    group: sampling
+  - path: stop
+    type: string
+    label: Stop
+    description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+    group: generation_length
diff --git a/models/nvidia/nemotron-mini-4b-instruct.yaml b/models/nvidia/nemotron-mini-4b-instruct.yaml
new file mode 100644
index 0000000..070d6bf
--- /dev/null
+++ b/models/nvidia/nemotron-mini-4b-instruct.yaml
@@ -0,0 +1,59 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: nemotron-mini-4b-instruct
+params:
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.2
+    range:
+      min: 0
+      max: 1
+    group: sampling
+  - path: top_p
+    type: number
+    label: Top P
+    description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.7
+    range:
+      max: 1
+    group: sampling
+  - path: max_tokens
+    type: integer
+    label: Max tokens
+    description: Maximum number of tokens to generate. Generation stops when this limit is reached.
+    default: 1024
+    range:
+      min: 1
+      max: 4096
+    group: generation_length
+  - path: frequency_penalty
+    type: number
+    label: Frequency penalty
+    description: Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
+    default: 0
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: presence_penalty
+    type: number
+    label: Presence penalty
+    description: Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
+    default: 0
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: stop
+    type: string
+    label: Stop
+    description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+    group: generation_length
+  - path: tools
+    type: string
+    label: Tools
+    description: A list of tools the model may call. If no tools are provided, the model will not call any tools.
+    group: tooling
diff --git a/models/nvidia/riva-translate-4b-instruct-v1.1.yaml b/models/nvidia/riva-translate-4b-instruct-v1.1.yaml
new file mode 100644
index 0000000..420c6e0
--- /dev/null
+++ b/models/nvidia/riva-translate-4b-instruct-v1.1.yaml
@@ -0,0 +1,54 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: riva-translate-4b-instruct-v1.1
+params:
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.
+    default: 0
+    range:
+      min: 0
+      max: 1
+    group: sampling
+  - path: top_p
+    type: number
+    label: Top P
+    description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.9
+    range:
+      max: 1
+    group: sampling
+  - path: max_tokens
+    type: integer
+    label: Max tokens
+    description: Maximum number of tokens to generate. Generation stops when this limit is reached.
+    default: 512
+    range:
+      min: 1
+      max: 4096
+    group: generation_length
+  - path: frequency_penalty
+    type: number
+    label: Frequency penalty
+    description: Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
+    default: 0
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: presence_penalty
+    type: number
+    label: Presence penalty
+    description: Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
+    default: 0
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: stop
+    type: string
+    label: Stop
+    description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+    group: generation_length
diff --git a/models/nvidia/usdcode-llama-3.1-70b-instruct.yaml b/models/nvidia/usdcode-llama-3.1-70b-instruct.yaml
new file mode 100644
index 0000000..6f4aa2b
--- /dev/null
+++ b/models/nvidia/usdcode-llama-3.1-70b-instruct.yaml
@@ -0,0 +1,42 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: usdcode-llama-3.1-70b-instruct
+params:
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.1
+    range:
+      min: 0
+      max: 1
+    group: sampling
+  - path: top_p
+    type: number
+    label: Top P
+    description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.
+    default: 1
+    range:
+      max: 1
+    group: sampling
+  - path: max_tokens
+    type: integer
+    label: Max tokens
+    description: Maximum number of tokens to generate. Generation stops when this limit is reached.
+    default: 1024
+    range:
+      min: 1
+      max: 2048
+    group: generation_length
+  - path: expert_type
+    type: enum
+    label: Expert type
+    description: The type of expert to use. 'knowledge' answers with USD knowledge, 'code' responds with vanilla OpenUSD code, 'helperfunction' uses high-level helper functions, and 'auto' lets the LLM determine which expert to use.
+    default: auto
+    values:
+      - auto
+      - code
+      - knowledge
+      - helperfunction
+    group: provider_metadata