chore: the consistency of MultiModalPromptMessageContent (#11721)

2025-12-09 19:06:51 +08:00 · 2024-12-17 15:01:38 +08:00
parent 78c3051585
commit c9b4029ce7
14 changed files with 108 additions and 99 deletions
--- a/api/core/model_runtime/model_providers/anthropic/llm/llm.py
+++ b/api/core/model_runtime/model_providers/anthropic/llm/llm.py
@@ -1,5 +1,4 @@
 import base64
-import io
 import json
 from collections.abc import Generator, Sequence
 from typing import Optional, Union, cast
@@ -18,7 +17,6 @@ from anthropic.types import (
 )
 from anthropic.types.beta.tools import ToolsBetaMessage
 from httpx import Timeout
-from PIL import Image

 from core.model_runtime.callbacks.base_callback import Callback
 from core.model_runtime.entities import (
@@ -498,22 +496,19 @@ class AnthropicLargeLanguageModel(LargeLanguageModel):
                                sub_messages.append(sub_message_dict)
                            elif message_content.type == PromptMessageContentType.IMAGE:
                                message_content = cast(ImagePromptMessageContent, message_content)
-                                if not message_content.data.startswith("data:"):
+                                if not message_content.base64_data:
                                    # fetch image data from url
                                    try:
-                                        image_content = requests.get(message_content.data).content
-                                        with Image.open(io.BytesIO(image_content)) as img:
-                                            mime_type = f"image/{img.format.lower()}"
+                                        image_content = requests.get(message_content.url).content
                                        base64_data = base64.b64encode(image_content).decode("utf-8")
                                    except Exception as ex:
                                        raise ValueError(
                                            f"Failed to fetch image data from url {message_content.data}, {ex}"
                                        )
                                else:
-                                    data_split = message_content.data.split(";base64,")
-                                    mime_type = data_split[0].replace("data:", "")
-                                    base64_data = data_split[1]
+                                    base64_data = message_content.base64_data

+                                mime_type = message_content.mime_type
                                if mime_type not in {"image/jpeg", "image/png", "image/gif", "image/webp"}:
                                    raise ValueError(
                                        f"Unsupported image type {mime_type}, "
@@ -526,19 +521,17 @@ class AnthropicLargeLanguageModel(LargeLanguageModel):
                                }
                                sub_messages.append(sub_message_dict)
                            elif isinstance(message_content, DocumentPromptMessageContent):
-                                data_split = message_content.data.split(";base64,")
-                                mime_type = data_split[0].replace("data:", "")
-                                base64_data = data_split[1]
-                                if mime_type != "application/pdf":
+                                if message_content.mime_type != "application/pdf":
                                    raise ValueError(
-                                        f"Unsupported document type {mime_type}, " "only support application/pdf"
+                                        f"Unsupported document type {message_content.mime_type}, "
+                                        "only support application/pdf"
                                    )
                                sub_message_dict = {
                                    "type": "document",
                                    "source": {
-                                        "type": message_content.encode_format,
-                                        "media_type": mime_type,
-                                        "data": base64_data,
+                                        "type": "base64",
+                                        "media_type": message_content.mime_type,
+                                        "data": message_content.data,
                                    },
                                }
                                sub_messages.append(sub_message_dict)
--- a/api/core/model_runtime/model_providers/tongyi/llm/llm.py
+++ b/api/core/model_runtime/model_providers/tongyi/llm/llm.py
@@ -434,9 +434,9 @@ class TongyiLargeLanguageModel(LargeLanguageModel):
                            sub_messages.append(sub_message_dict)
                        elif message_content.type == PromptMessageContentType.VIDEO:
                            message_content = cast(VideoPromptMessageContent, message_content)
-                            video_url = message_content.data
-                            if message_content.data.startswith("data:"):
-                                raise InvokeError("not support base64, please set MULTIMODAL_SEND_VIDEO_FORMAT to url")
+                            video_url = message_content.url
+                            if not video_url:
+                                raise InvokeError("not support base64, please set MULTIMODAL_SEND_FORMAT to url")

                            sub_message_dict = {"video": video_url}
                            sub_messages.append(sub_message_dict)