tencent_ocr/.history/pkg/handler/ocr_20250115141957.go
2025-01-15 16:01:18 +08:00

166 lines
4.3 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package handler
import (
	"crypto/subtle"
	"encoding/base64"
	"net/http"
	"strings"

	"github.com/gin-gonic/gin"
	"github.com/google/generative-ai-go/genai"
	"github.com/tencentcloud/tencentcloud-sdk-go/tencentcloud/common"
	"github.com/tencentcloud/tencentcloud-sdk-go/tencentcloud/common/profile"
	ocr "github.com/tencentcloud/tencentcloud-sdk-go/tencentcloud/ocr/v20181119"
	"google.golang.org/api/option"
)
// OCRHandler holds the secrets the OCR endpoint needs: the Tencent Cloud
// credential pair, the Gemini API key, and the API key that callers must
// present with each request.
type OCRHandler struct {
	// Tencent Cloud secret ID / secret key pair.
	tencentSecretID, tencentSecretKey string
	// API key used when creating the Gemini client.
	geminiAPIKey string
	// Key that an incoming request's "apikey" field is compared against.
	apiKey string
}
// OCRRequest is the JSON body accepted by the OCR endpoint.
type OCRRequest struct {
	// ImageBase64 is the image as base64 text; a leading "...base64," data-URI
	// prefix is tolerated by the handler.
	ImageBase64 string `json:"image_base64"`
	// ImageURL is the address of the image; the handler prefers it over
	// ImageBase64 when both are set.
	ImageURL string `json:"image_url"`
	// Scene is forwarded to the OCR request when non-empty.
	Scene string `json:"scene"`
	// APIKey authenticates the caller; binding fails when it is absent.
	APIKey string `json:"apikey" binding:"required"`
}
// OCRResponse is the JSON body the OCR endpoint writes for every outcome.
type OCRResponse struct {
	// OriginalText is the text as recognized by OCR, one detection per line.
	OriginalText string `json:"original_text"`
	// Result is the post-processed text on success, or a short error message
	// when Success is false.
	Result string `json:"result"`
	// Success reports whether the request was fully processed.
	Success bool `json:"success"`
}
// NewOCRHandler builds an OCRHandler from the given secrets.
//
// The arguments are stored exactly as passed; no validation is performed.
func NewOCRHandler(tencentSecretID, tencentSecretKey, geminiAPIKey, apiKey string) *OCRHandler {
	h := new(OCRHandler)
	h.tencentSecretID = tencentSecretID
	h.tencentSecretKey = tencentSecretKey
	h.geminiAPIKey = geminiAPIKey
	h.apiKey = apiKey
	return h
}
// HandleOCR is the gin handler for the OCR endpoint.
//
// It binds the JSON body into an OCRRequest, checks the caller's API key,
// runs Tencent Cloud GeneralHandwritingOCR on the supplied image (image_url
// takes precedence over image_base64 when both are present), and then asks
// the Gemini model to correct the recognized text. Every outcome is written
// as an OCRResponse:
//
//   - 400: the body failed to bind, no image was supplied, or image_base64 is
//     empty / not valid standard base64
//   - 401: the API key does not match
//   - 500: an upstream client could not be created or an upstream call failed
//   - 200: OriginalText holds the raw OCR text and Result the Gemini output
func (h *OCRHandler) HandleOCR(c *gin.Context) {
	var req OCRRequest
	if err := c.ShouldBindJSON(&req); err != nil {
		ocrFail(c, http.StatusBadRequest, "Invalid request format")
		return
	}

	// Compare in constant time so response latency does not reveal how much
	// of a guessed key matched.
	if subtle.ConstantTimeCompare([]byte(req.APIKey), []byte(h.apiKey)) != 1 {
		ocrFail(c, http.StatusUnauthorized, "Invalid API key")
		return
	}

	if req.ImageURL == "" && req.ImageBase64 == "" {
		ocrFail(c, http.StatusBadRequest, "Either image_url or image_base64 must be provided")
		return
	}

	// Initialize the Tencent Cloud OCR client.
	credential := common.NewCredential(h.tencentSecretID, h.tencentSecretKey)
	cpf := profile.NewClientProfile()
	cpf.HttpProfile.Endpoint = "ocr.tencentcloudapi.com"
	client, err := ocr.NewClient(credential, "", cpf)
	if err != nil {
		ocrFail(c, http.StatusInternalServerError, "Failed to initialize OCR client")
		return
	}

	// Build the OCR request; the URL wins when both image sources are given.
	request := ocr.NewGeneralHandwritingOCRRequest()
	if req.ImageURL != "" {
		request.ImageUrl = common.StringPtr(req.ImageURL)
	} else {
		payload, ok := cleanBase64Image(req.ImageBase64)
		if !ok {
			ocrFail(c, http.StatusBadRequest, "Invalid base64 image")
			return
		}
		request.ImageBase64 = common.StringPtr(payload)
	}
	if req.Scene != "" {
		request.Scene = common.StringPtr(req.Scene)
	}

	response, err := client.GeneralHandwritingOCR(request)
	// A nil payload without an error is treated as a failure rather than
	// dereferenced.
	if err != nil || response == nil || response.Response == nil {
		ocrFail(c, http.StatusInternalServerError, "OCR processing failed")
		return
	}
	ocrText := joinDetectedText(response.Response.TextDetections)

	// Post-process the OCR text with Gemini, bound to the request's context.
	ctx := c.Request.Context()
	gemini, err := genai.NewClient(ctx, option.WithAPIKey(h.geminiAPIKey))
	if err != nil {
		ocrFail(c, http.StatusInternalServerError, "Failed to initialize Gemini client")
		return
	}
	defer gemini.Close()

	model := gemini.GenerativeModel("gemini-2.0-flash-exp")
	// The prompt tells the model to act as an OCR-correction assistant and to
	// output only the corrected text, without explanations.
	prompt := "你是一个专业的助手负责纠正OCR识别结果中的文本。只需要输出识别结果不需要输出任何解释。\n\n" + ocrText
	resp, err := model.GenerateContent(ctx, genai.Text(prompt))
	if err != nil {
		ocrFail(c, http.StatusInternalServerError, "Text processing failed")
		return
	}

	c.JSON(http.StatusOK, OCRResponse{
		Success:      true,
		OriginalText: ocrText,
		Result:       firstCandidateText(resp),
	})
}

// ocrFail writes an unsuccessful OCRResponse carrying msg with the given
// HTTP status.
func ocrFail(c *gin.Context, status int, msg string) {
	c.JSON(status, OCRResponse{
		Success: false,
		Result:  msg,
	})
}

// cleanBase64Image drops everything up to and including the first "base64,"
// marker (if any) and reports whether the remainder is non-empty, valid
// standard base64.
func cleanBase64Image(raw string) (string, bool) {
	const marker = "base64,"
	if idx := strings.Index(raw, marker); idx != -1 {
		raw = raw[idx+len(marker):]
	}
	// An empty payload decodes without error but is not a usable image.
	if raw == "" {
		return "", false
	}
	if _, err := base64.StdEncoding.DecodeString(raw); err != nil {
		return "", false
	}
	return raw, true
}

// joinDetectedText concatenates the detected text of every detection, each
// followed by a newline. Nil entries and entries without text are skipped.
func joinDetectedText(detections []*ocr.TextDetection) string {
	var b strings.Builder
	for _, d := range detections {
		if d == nil || d.DetectedText == nil {
			continue
		}
		b.WriteString(*d.DetectedText)
		b.WriteByte('\n')
	}
	return b.String()
}

// firstCandidateText returns the first part of the first candidate when that
// part is text, and "" otherwise.
func firstCandidateText(resp *genai.GenerateContentResponse) string {
	if resp == nil || len(resp.Candidates) == 0 {
		return ""
	}
	cand := resp.Candidates[0]
	// Content is a pointer in the SDK; guard it so a candidate that carries
	// no content yields an empty result instead of a panic.
	if cand == nil || cand.Content == nil || len(cand.Content.Parts) == 0 {
		return ""
	}
	if text, ok := cand.Content.Parts[0].(genai.Text); ok {
		return string(text)
	}
	return ""
}