From e3b8119ab44752d8b004f931ddbd6dfc68c950a4 Mon Sep 17 00:00:00 2001
From: "Brian C. Van Essen"
Date: Thu, 6 Feb 2025 12:40:34 -0800
Subject: [PATCH 1/2] Small cleanups to enable running CePO on GPU systems.

---
 configs/cepo_config.yaml |  4 ++--
 optillm.py               | 16 ++++++++--------
 optillm/cepo.py          | 36 ++++++++++++++++++------------------
 3 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/configs/cepo_config.yaml b/configs/cepo_config.yaml
index 3b2f55fd..9ccd7ace 100644
--- a/configs/cepo_config.yaml
+++ b/configs/cepo_config.yaml
@@ -6,8 +6,8 @@ planning_n: 3
 planning_m: 6
 planning_temperature_step1: 0.55
 planning_temperature_step2: 0.25
-planning_temperature_step3: 0.1
-planning_temperature_step4: 0
+planning_temperature_step3: 0.10
+planning_temperature_step4: 0.01
 planning_max_tokens_step1: 4096
 planning_max_tokens_step2: 4096
 planning_max_tokens_step3: 4096

diff --git a/optillm.py b/optillm.py
index 5bc2ddc6..f909d362 100644
--- a/optillm.py
+++ b/optillm.py
@@ -4,7 +4,7 @@
 import os
 import secrets
 from flask import Flask, request, jsonify
-from cerebras.cloud.sdk import Cerebras
+# from cerebras.cloud.sdk import Cerebras
 from openai import AzureOpenAI, OpenAI
 from flask import Response
 import json
@@ -55,13 +55,13 @@ def get_config():
         API_KEY = os.environ.get("OPTILLM_API_KEY")
         default_client = create_inference_client()

     # Cerebras, OpenAI, Azure, or LiteLLM API configuration
-    elif os.environ.get("CEREBRAS_API_KEY"):
-        API_KEY = os.environ.get("CEREBRAS_API_KEY")
-        base_url = server_config['base_url']
-        if base_url != "":
-            default_client = Cerebras(api_key=API_KEY, base_url=base_url)
-        else:
-            default_client = Cerebras(api_key=API_KEY)
+    # elif os.environ.get("CEREBRAS_API_KEY"):
+    #     API_KEY = os.environ.get("CEREBRAS_API_KEY")
+    #     base_url = server_config['base_url']
+    #     if base_url != "":
+    #         default_client = Cerebras(api_key=API_KEY, base_url=base_url)
+    #     else:
+    #         default_client = Cerebras(api_key=API_KEY)
     elif os.environ.get("OPENAI_API_KEY"):
         API_KEY = os.environ.get("OPENAI_API_KEY")
         base_url = server_config['base_url']

diff --git a/optillm/cepo.py b/optillm/cepo.py
index c73b901f..32e48684 100644
--- a/optillm/cepo.py
+++ b/optillm/cepo.py
@@ -1,31 +1,31 @@
 # Apache license 2 - added after the fork for the CePO method
 import re
-import cerebras
+# import cerebras
 import openai
 import yaml

 from dataclasses import dataclass
-from cerebras.cloud.sdk import BadRequestError as CerebrasBadRequestError
+# from cerebras.cloud.sdk import BadRequestError as CerebrasBadRequestError
 from openai import BadRequestError as OpenAIBadRequestError
-from typing import Optional, Literal
+from typing import Optional, Literal, Any


 @dataclass
 class CepoConfig:
-    bestofn_n: int  # number of responses to be generated in best of n stage
-    bestofn_temperature: float  # temperature for verifier in best of n stage
-    bestofn_max_tokens: int  # maximum number of tokens for verifier in best of n stage
-    bestofn_rating_type: Literal["absolute", "pairwise"]  # type of rating in best of n stage
-    planning_n: int  # number of plans generated in planning stage
-    planning_m: int  # number of attempts to generate n plans in planning stage
-    planning_temperature_step1: float  # temperature for generator in step 1 of planning stage
-    planning_temperature_step2: float  # temperature for generator in step 2 of planning stage
-    planning_temperature_step3: float  # temperature for generator in step 3 of planning stage
-    planning_temperature_step4: float  # temperature for generator in step 4 of planning stage
-    planning_max_tokens_step1: int  # maximum number of tokens in step 1 of planning stage
-    planning_max_tokens_step2: int  # maximum number of tokens in step 2 of planning stage
-    planning_max_tokens_step3: int  # maximum number of tokens in step 3 of planning stage
-    planning_max_tokens_step4: int  # maximum number of tokens in step 4 of planning stage
+    bestofn_n: int = 3  # number of responses to be generated in best of n stage
+    bestofn_temperature: float = 0.1  # temperature for verifier in best of n stage
+    bestofn_max_tokens: int = 4096  # maximum number of tokens for verifier in best of n stage
+    bestofn_rating_type: Literal["absolute", "pairwise"] = "absolute"  # type of rating in best of n stage
+    planning_n: int = 3  # number of plans generated in planning stage
+    planning_m: int = 6  # number of attempts to generate n plans in planning stage
+    planning_temperature_step1: float = 0.55  # temperature for generator in step 1 of planning stage
+    planning_temperature_step2: float = 0.25  # temperature for generator in step 2 of planning stage
+    planning_temperature_step3: float = 0.10  # temperature for generator in step 3 of planning stage
+    planning_temperature_step4: float = 0.01  # temperature for generator in step 4 of planning stage
+    planning_max_tokens_step1: int = 4096  # maximum number of tokens in step 1 of planning stage
+    planning_max_tokens_step2: int = 4096  # maximum number of tokens in step 2 of planning stage
+    planning_max_tokens_step3: int = 4096  # maximum number of tokens in step 3 of planning stage
+    planning_max_tokens_step4: int = 4096  # maximum number of tokens in step 4 of planning stage


 # given command line arguments which includes a yaml file path, initialize a CePO configuration
@@ -183,7 +183,7 @@ def generate_completion(system_prompt: str, task: str, client: Any, model: str,
             )
             final_solution = response.choices[0].message.content
             completion_tokens += response.usage.completion_tokens
-        except (CerebrasBadRequestError, OpenAIBadRequestError) as e:
+        except OpenAIBadRequestError as e:
             # In case of an error, take the first plan as the final solution
             final_solution = plans[0]
             messages = []

From 6ca116f482b483d9ec48e5c75c636bf069689d58 Mon Sep 17 00:00:00 2001
From: "Brian C. Van Essen"
Date: Fri, 7 Feb 2025 11:12:28 -0800
Subject: [PATCH 2/2] Fixed a bug in how the & and | operators were parsed.
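
When a segment of the model string combines approaches with '&' (AND) or
'|' (OR), the old code extended the approach list with the raw tokens from
split(), so any whitespace around the operator was kept and a token such
as ' mcts' never matched a known approach name. Each token is now stripped
before it is appended. A minimal illustration of the failure mode (the
approach names below are placeholders, not tied to the repository):

    "moa & mcts".split('&')
    # -> ['moa ', ' mcts']  (raw tokens keep the surrounding whitespace)
    [p.strip() for p in "moa & mcts".split('&')]
    # -> ['moa', 'mcts']    (stripped tokens match as intended)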
---
 optillm.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/optillm.py b/optillm.py
index f909d362..e59e39f6 100644
--- a/optillm.py
+++ b/optillm.py
@@ -231,10 +231,12 @@ def parse_combined_approach(model: str, known_approaches: list, plugin_approache
             approaches.append(part)
         elif '&' in part:
             operation = 'AND'
-            approaches.extend(part.split('&'))
+            for approach in part.split('&'):
+                approaches.append(approach.strip())
         elif '|' in part:
             operation = 'OR'
-            approaches.extend(part.split('|'))
+            for approach in part.split('|'):
+                approaches.append(approach.strip())
         else:
             parsing_approaches = False
             model_parts.append(part)
@@ -518,6 +520,7 @@ def proxy():
     n = data.get('n', server_config['n'])  # Get n value from request or config

     optillm_approach = data.get('optillm_approach', server_config['approach'])
+    # print(f'BVE - I think that the approach is {optillm_approach}')
     logger.debug(data)
     server_config['mcts_depth'] = data.get('mcts_depth', server_config['mcts_depth'])
     server_config['mcts_exploration'] = data.get('mcts_exploration', server_config['mcts_exploration'])
@@ -535,6 +538,7 @@ def proxy():
     default_client, api_key = get_config()

     operation, approaches, model = parse_combined_approach(model, known_approaches, plugin_approaches)
+    # print(f'BVE Checking for the combined approach {operation} and {approaches} with models {model}')
     logger.info(f'Using approach(es) {approaches}, operation {operation}, with model {model}')

     if bearer_token != "" and bearer_token.startswith("sk-"):
@@ -594,6 +598,7 @@ def proxy():
         if isinstance(messages, list) and messages:
             # Only process if format changed
             response = messages[-1]['content']
+        logger.debug(f'Using stream: {stream}')
         if stream:
             return Response(generate_streaming_response(response, model), content_type='text/event-stream')
         else:
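
Note on PATCH 1/2: with defaults on every CepoConfig field, the dataclass
can be instantiated without a YAML file, and a config file only needs to
list the fields it overrides. A minimal sketch of that usage (the override
loop is illustrative, not the repository's actual loader):

    import yaml
    from optillm.cepo import CepoConfig

    cepo_config = CepoConfig()  # every field falls back to its default
    with open("configs/cepo_config.yaml") as f:
        overrides = yaml.safe_load(f) or {}
    for key, value in overrides.items():
        if hasattr(cepo_config, key):  # ignore keys that are not config fields
            setattr(cepo_config, key, value)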