andreasjansson / clip-features

Return CLIP features for the clip-vit-large-patch14 model

  • Public
  • 54.7M runs
  • GitHub

Input

Output

Run time and cost

This model runs on Nvidia T4 GPU hardware. Predictions typically complete within 1 second.

Readme

Cog model that outputs clip-vit-large-patch14 features for text and images.

Run with the API:

import replicate
import numpy as np
from numpy.linalg import norm

def cos_sim(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Computed as the dot product of the two inputs divided by the product
    of their norms.
    """
    dot_product = np.dot(a, b)
    magnitude = norm(a) * norm(b)
    return dot_product / magnitude

# Inputs are passed as a single multi-line string: here three text prompts
# followed by one image URL, one per line.
inputs = """
a photo of a dog
a cat
two cats with remote controls
https://replicate.com/api/models/cjwbw/clip-vit-large-patch14/files/36b04aec-efe2-4dea-9c9d-a5faca68b2b2/000000039769.jpg
"""

# run prediction
model = replicate.models.get("andreasjansson/clip-features")
outputs = model.predict(inputs=inputs)

# output similarity of the three text lines with the image on line 4
# (outputs[3] is the image entry; entries are read as dictionaries here)
for i in range(3):
    print(outputs[i]["input"])
    print(cos_sim(outputs[i]["embedding"], outputs[3]["embedding"]))
    print()

# Variant of the same example that reads each output's fields as attributes
# (`.input`, `.embedding`) instead of dictionary keys.
# NOTE(review): which access style works depends on what `model.predict`
# returns in the installed client -- confirm and keep only the matching loop.

# run prediction
model = replicate.models.get("andreasjansson/clip-features")
outputs = model.predict(inputs=inputs)

# output similarity of the three text lines with the image on line 4
for i in range(3):
    print(outputs[i].input)
    print(cos_sim(outputs[i].embedding, outputs[3].embedding))
    print()