prompthunt/cog-realvis-training
- Public
- 1 run
Run prompthunt/cog-realvis-training with an API
Use one of our client libraries to get started quickly. Clicking on a library will take you to the Playground tab where you can tweak different inputs, see the results, and copy the corresponding code to use in your own project.
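For example, here is a minimal sketch using the `replicate` Python client. The version hash and the input archive URL are placeholders, not real values; copy the actual version and your own archive URL when you run it.

```python
# Minimal sketch: start a fine-tuning run with the replicate Python
# client (pip install replicate). Assumes REPLICATE_API_TOKEN is set
# in your environment. The version hash and archive URL below are
# placeholders; substitute the real ones from this page.
import replicate

output = replicate.run(
    "prompthunt/cog-realvis-training:<version-hash>",
    input={
        "input_images": "https://example.com/training-images.zip",
        "token_string": "TOK",
        "caption_prefix": "a photo of TOK, ",
        "max_train_steps": 1000,
    },
)
print(output)  # URI of the trained weights (see the output schema below)
```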
Input schema
The fields you can use to run this model with an API. If you don't give a value for a field, its default value will be used.
Field | Type | Default value | Description |
---|---|---|---|
input_images | string | | A .zip or .tar file containing the image files that will be used for fine-tuning |
seed | integer | | Random seed for reproducible training. Leave empty to use a random seed |
resolution | integer | 768 | Square pixel resolution which your images will be resized to for training |
train_batch_size | integer | 4 | Batch size (per device) for training |
num_train_epochs | integer | 4000 | Number of epochs to loop through your training dataset |
max_train_steps | integer | 1000 | Number of individual training steps. Takes precedence over num_train_epochs |
is_lora | boolean | True | Whether to use LoRA training. If set to False, full fine-tuning will be used |
unet_learning_rate | number | 0.000001 | Learning rate for the U-Net. We recommend a value between `1e-6` and `1e-5`. |
ti_lr | number | 0.0003 | Scaling of learning rate for training textual inversion embeddings. Don't alter unless you know what you're doing. |
lora_lr | number | 0.0001 | Scaling of learning rate for training LoRA embeddings. Don't alter unless you know what you're doing. |
lora_rank | integer | 32 | Rank of LoRA embeddings. Don't alter unless you know what you're doing. |
lr_scheduler | string (enum) | constant | Learning rate scheduler to use for training. Options: constant, linear |
lr_warmup_steps | integer | 100 | Number of warmup steps for lr schedulers with warmups. |
token_string | string | TOK | A unique string that will be trained to refer to the concept in the input images. Can be anything, but TOK works well |
caption_prefix | string | a photo of TOK, | Text which will be used as a prefix during automatic captioning. Must contain the `token_string`. For example, if the caption prefix is 'a photo of TOK', automatic captioning will expand it to 'a photo of TOK under a bridge', 'a photo of TOK holding a cup', etc. |
mask_target_prompts | string | | Prompt that describes the part of the image you consider important. For example, if you are fine-tuning on your pet, `photo of a dog` is a good prompt. Prompt-based masking is used to focus the fine-tuning process on the important/salient parts of the image |
crop_based_on_salience | boolean | True | If you want to crop the image to `target_size` based on the important parts of the image, set this to True. If you want to crop based on face detection, set this to False |
use_face_detection_instead | boolean | False | Whether to use face detection instead of CLIPSeg for masking. For face applications, we recommend this option. |
clipseg_temperature | number | 1 | How blurry you want the CLIPSeg mask to be. We recommend a value between `0.5` and `1.0`. Decrease it for a sharper (but more error-prone) mask. |
verbose | boolean | True | Verbose output |
checkpointing_steps | integer | 999999 | Number of steps between saving checkpoints. Set to a very high number to disable intermediate checkpointing. |
input_images_filetype | string (enum) | infer | Filetype of the input images archive. Can be either `zip` or `tar`. By default it's `infer`, and the type is inferred from the file extension of the input. Options: zip, tar, infer |

Schema
{
"type": "object",
"title": "Input",
"required": [
"input_images"
],
"properties": {
"seed": {
"type": "integer",
"title": "Seed",
"x-order": 1,
"description": "Random seed for reproducible training. Leave empty to use a random seed"
},
"ti_lr": {
"type": "number",
"title": "Ti Lr",
"default": 0.0003,
"x-order": 8,
"description": "Scaling of learning rate for training textual inversion embeddings. Don't alter unless you know what you're doing."
},
"is_lora": {
"type": "boolean",
"title": "Is Lora",
"default": true,
"x-order": 6,
"description": "Whether to use LoRA training. If set to False, will use Full fine tuning"
},
"lora_lr": {
"type": "number",
"title": "Lora Lr",
"default": 0.0001,
"x-order": 9,
"description": "Scaling of learning rate for training LoRA embeddings. Don't alter unless you know what you're doing."
},
"verbose": {
"type": "boolean",
"title": "Verbose",
"default": true,
"x-order": 19,
"description": "verbose output"
},
"lora_rank": {
"type": "integer",
"title": "Lora Rank",
"default": 32,
"x-order": 10,
"description": "Rank of LoRA embeddings. Don't alter unless you know what you're doing."
},
"resolution": {
"type": "integer",
"title": "Resolution",
"default": 768,
"x-order": 2,
"description": "Square pixel resolution which your images will be resized to for training"
},
"input_images": {
"type": "string",
"title": "Input Images",
"format": "uri",
"x-order": 0,
"description": "A .zip or .tar file containing the image files that will be used for fine-tuning"
},
"lr_scheduler": {
"enum": [
"constant",
"linear"
],
"type": "string",
"title": "lr_scheduler",
"description": "Learning rate scheduler to use for training",
"default": "constant",
"x-order": 11
},
"token_string": {
"type": "string",
"title": "Token String",
"default": "TOK",
"x-order": 13,
"description": "A unique string that will be trained to refer to the concept in the input images. Can be anything, but TOK works well"
},
"caption_prefix": {
"type": "string",
"title": "Caption Prefix",
"default": "a photo of TOK, ",
"x-order": 14,
"description": "Text which will be used as prefix during automatic captioning. Must contain the `token_string`. For example, if caption text is 'a photo of TOK', automatic captioning will expand to 'a photo of TOK under a bridge', 'a photo of TOK holding a cup', etc."
},
"lr_warmup_steps": {
"type": "integer",
"title": "Lr Warmup Steps",
"default": 100,
"x-order": 12,
"description": "Number of warmup steps for lr schedulers with warmups."
},
"max_train_steps": {
"type": "integer",
"title": "Max Train Steps",
"default": 1000,
"x-order": 5,
"description": "Number of individual training steps. Takes precedence over num_train_epochs"
},
"num_train_epochs": {
"type": "integer",
"title": "Num Train Epochs",
"default": 4000,
"x-order": 4,
"description": "Number of epochs to loop through your training dataset"
},
"train_batch_size": {
"type": "integer",
"title": "Train Batch Size",
"default": 4,
"x-order": 3,
"description": "Batch size (per device) for training"
},
"unet_learning_rate": {
"type": "number",
"title": "Unet Learning Rate",
"default": 1e-06,
"x-order": 7,
"description": "Learning rate for the U-Net. We recommend this value to be somewhere between `1e-6` to `1e-5`."
},
"checkpointing_steps": {
"type": "integer",
"title": "Checkpointing Steps",
"default": 999999,
"x-order": 20,
"description": "Number of steps between saving checkpoints. Set to very very high number to disable checkpointing, because you don't need one."
},
"clipseg_temperature": {
"type": "number",
"title": "Clipseg Temperature",
"default": 1,
"x-order": 18,
"description": "How blurry you want the CLIPSeg mask to be. We recommend this value be something between `0.5` to `1.0`. If you want to have more sharp mask (but thus more errorful), you can decrease this value."
},
"mask_target_prompts": {
"type": "string",
"title": "Mask Target Prompts",
"x-order": 15,
"description": "Prompt that describes part of the image that you will find important. For example, if you are fine-tuning your pet, `photo of a dog` will be a good prompt. Prompt-based masking is used to focus the fine-tuning process on the important/salient parts of the image"
},
"input_images_filetype": {
"enum": [
"zip",
"tar",
"infer"
],
"type": "string",
"title": "input_images_filetype",
"description": "Filetype of the input images. Can be either `zip` or `tar`. By default its `infer`, and it will be inferred from the ext of input file.",
"default": "infer",
"x-order": 21
},
"crop_based_on_salience": {
"type": "boolean",
"title": "Crop Based On Salience",
"default": true,
"x-order": 16,
"description": "If you want to crop the image to `target_size` based on the important parts of the image, set this to True. If you want to crop the image based on face detection, set this to False"
},
"use_face_detection_instead": {
"type": "boolean",
"title": "Use Face Detection Instead",
"default": false,
"x-order": 17,
"description": "If you want to use face detection instead of CLIPSeg for masking. For face applications, we recommend using this option."
}
}
}
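As a quick sanity check, an input payload can be validated against this schema before you submit a run. Here is a minimal sketch using the `jsonschema` package; it assumes the JSON above has been saved locally as `input_schema.json`, and the archive URL is a placeholder.

```python
# Sketch: validate a candidate training input against the schema above
# using the jsonschema package (pip install jsonschema). The archive
# URL is hypothetical; the local schema file name is an assumption.
import json
from jsonschema import validate

with open("input_schema.json") as f:  # the JSON schema shown above
    schema = json.load(f)

candidate_input = {
    "input_images": "https://example.com/pet-photos.tar",
    "resolution": 768,
    "is_lora": True,
    "mask_target_prompts": "photo of a dog",
    "use_face_detection_instead": False,
}

validate(instance=candidate_input, schema=schema)  # raises ValidationError on mismatch
print("input is valid")
```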
Output schema
The shape of the response you’ll get when you run this model with an API.
Schema
{
"type": "string",
"title": "Output",
"format": "uri"
}
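Because the output is a single URI string pointing at the trained weights, fetching the result is a plain download. A minimal sketch; `output` is the string returned by the `replicate.run()` call above, and the local file name is an assumption.

```python
# Sketch: download the trained weights from the URI returned by the run.
# "output" comes from the replicate.run() snippet earlier; the file name
# "trained_weights.tar" is an assumption, not dictated by the model.
import urllib.request

urllib.request.urlretrieve(output, "trained_weights.tar")
print("saved trained weights to trained_weights.tar")
```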