Spaces:

Honl
/

opendesk-api

Build error

App Files Files Community

Honl committed on Dec 24, 2025

Commit

278e8da

verified ·

1 Parent(s): 6953dbd

Update app.py

Browse files

Files changed (1) hide show

app.py +113 -0

app.py CHANGED Viewed

	@@ -0,0 +1,113 @@

+import gradio as gr
+from huggingface_hub import hf_hub_download
+from llama_cpp import Llama
+# Model configuration
+# File name of the GGUF weights to fetch from the Hub repository below.
+MODEL_NAME = "qwen2.5-7b-instruct-q4_k_m.gguf"
+# Hugging Face Hub repository that load_model() downloads MODEL_NAME from.
+MODEL_REPO = "Qwen/Qwen2.5-7B-Instruct-GGUF"
+def load_model():
+    """Download and load the GGUF model with CPU optimizations."""
+    print(f"Downloading model {MODEL_NAME} from {MODEL_REPO}...")
+    # Download the model (cached after first download)
+    model_path = hf_hub_download(
+        repo_id=MODEL_REPO,
+        filename=MODEL_NAME,
+        resume_download=True
+    )
+    print(f"Model downloaded to: {model_path}")
+    print("Loading model... This may take 30-60 seconds.")
+    # Initialize Llama with CPU optimizations
+    llm = Llama(
+        model_path=model_path,
+        n_ctx=2048,        # Context length (user requirement)
+        n_threads=8,       # CPU threads for inference (user requirement)
+        n_batch=512,       # Batch size for processing
+        verbose=False,     # Suppress logging
+        # CPU optimization flags
+        use_mmap=True,     # Memory mapping for faster loading
+        use_mlock=False,   # Don't lock memory (not needed for CPU tier)
+    )
+    print("Model loaded successfully!")
+    return llm
+# Global model instance (loaded at startup)
+# load_model() runs here, at module import time, so the download and model
+# load complete before any statement below this line executes.
+# generate_response() reads this module-level name.
+llm = load_model()
+def generate_response(message, history):
+    """
+    Generate a response using the Qwen model.
+    Args:
+        message: The latest user message
+        history: List of previous conversation pairs
+    Returns:
+        str: Generated assistant response
+    """
+    try:
+        # Build message list from history
+        messages = []
+        # Add system message for instruction following
+        messages.append({
+            "role": "system",
+            "content": "You are Qwen, a helpful assistant."
+        })
+        # Add conversation history
+        for user_msg, assistant_msg in history:
+            if user_msg:
+                messages.append({"role": "user", "content": user_msg})
+            if assistant_msg:
+                messages.append({"role": "assistant", "content": assistant_msg})
+        # Add current user message
+        messages.append({"role": "user", "content": message})
+        # Generate response
+        response = llm.create_chat_completion(
+            messages=messages,
+            max_tokens=512,
+            temperature=0.7,
+            top_p=0.95,
+            stream=False
+        )
+        # Extract and return the assistant's message
+        return response["choices"][0]["message"]["content"]
+    except Exception as e:
+        print(f"Error generating response: {e}")
+        return f"Sorry, an error occurred: {str(e)}"
+# Create Gradio ChatInterface
+demo = gr.ChatInterface(
+    fn=generate_response,
+    title="Qwen2.5-7B-Instruct Private API",
+    description=(
+        "A private API server running Qwen2.5-7B-Instruct-GGUF (Q4_K_M) on CPU. "
+        "Model loading may take 30-60 seconds on first run. "
+        f"Context length: 2048 tokens, Threads: 8"
+    ),
+    theme=gr.themes.Base(),
+    examples=[
+        ["What is machine learning?"],
+        ["Write a Python function to calculate fibonacci numbers"],
+        ["Explain quantum computing in simple terms"],
+    ],
+    cache_examples=False,
+)
+if __name__ == "__main__":
+    # Launch the Gradio app
+    # In HF Spaces, this will run on the public URL (set Space to Private for privacy)
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False
+    )