Honl committed on
Commit 278e8da · verified · 1 Parent(s): 6953dbd

Update app.py

Files changed (1)
  1. app.py +113 -0
app.py CHANGED
@@ -0,0 +1,113 @@
+ import gradio as gr
+ from huggingface_hub import hf_hub_download
+ from llama_cpp import Llama
+
+ # Model configuration
+ MODEL_NAME = "qwen2.5-7b-instruct-q4_k_m.gguf"
+ MODEL_REPO = "Qwen/Qwen2.5-7B-Instruct-GGUF"
+
+ def load_model():
+     """Download and load the GGUF model with CPU optimizations."""
+     print(f"Downloading model {MODEL_NAME} from {MODEL_REPO}...")
+
+     # Download the model (cached after the first run; recent versions of
+     # huggingface_hub resume interrupted downloads automatically, so the
+     # deprecated resume_download flag is no longer needed)
+     model_path = hf_hub_download(
+         repo_id=MODEL_REPO,
+         filename=MODEL_NAME,
+     )
+
+     print(f"Model downloaded to: {model_path}")
+     print("Loading model... This may take 30-60 seconds.")
+
+     # Initialize Llama with CPU optimizations
+     llm = Llama(
+         model_path=model_path,
+         n_ctx=2048,       # Context length (user requirement)
+         n_threads=8,      # CPU threads for inference (user requirement)
+         n_batch=512,      # Batch size for processing
+         verbose=False,    # Suppress logging
+         # CPU optimization flags
+         use_mmap=True,    # Memory mapping for faster loading
+         use_mlock=False,  # Don't lock memory (not needed for CPU tier)
+     )
+
+     print("Model loaded successfully!")
+     return llm
+
+ # Global model instance (loaded at startup)
+ llm = load_model()
+
+ def generate_response(message, history):
+     """
+     Generate a response using the Qwen model.
+
+     Args:
+         message: The latest user message
+         history: List of previous conversation pairs
+
+     Returns:
+         str: Generated assistant response
+     """
+     try:
+         # Build message list from history
+         messages = []
+
+         # Add system message for instruction following
+         messages.append({
+             "role": "system",
+             "content": "You are Qwen, a helpful assistant."
+         })
+
+         # Add conversation history (tuple-format pairs)
+         for user_msg, assistant_msg in history:
+             if user_msg:
+                 messages.append({"role": "user", "content": user_msg})
+             if assistant_msg:
+                 messages.append({"role": "assistant", "content": assistant_msg})
+
+         # Add current user message
+         messages.append({"role": "user", "content": message})
+
+         # Generate response
+         response = llm.create_chat_completion(
+             messages=messages,
+             max_tokens=512,
+             temperature=0.7,
+             top_p=0.95,
+             stream=False
+         )
+
+         # Extract and return the assistant's message
+         return response["choices"][0]["message"]["content"]
+
+     except Exception as e:
+         print(f"Error generating response: {e}")
+         return f"Sorry, an error occurred: {str(e)}"
+
+ # Create Gradio ChatInterface
+ demo = gr.ChatInterface(
+     fn=generate_response,
+     title="Qwen2.5-7B-Instruct Private API",
+     description=(
+         "A private API server running Qwen2.5-7B-Instruct-GGUF (Q4_K_M) on CPU. "
+         "Model loading may take 30-60 seconds on first run. "
+         "Context length: 2048 tokens, Threads: 8"
+     ),
+     theme=gr.themes.Base(),
+     examples=[
+         ["What is machine learning?"],
+         ["Write a Python function to calculate fibonacci numbers"],
+         ["Explain quantum computing in simple terms"],
+     ],
+     cache_examples=False,
+ )
+
+ if __name__ == "__main__":
+     # Launch the Gradio app
+     # In HF Spaces, this will run on the public URL (set Space to Private for privacy)
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=False
+     )