From e3b8119ab44752d8b004f931ddbd6dfc68c950a4 Mon Sep 17 00:00:00 2001
From: "Brian C. Van Essen"
Date: Thu, 6 Feb 2025 12:40:34 -0800
Subject: [PATCH 1/2] Small cleanups to enable running CePO on GPU systems.

---
 configs/cepo_config.yaml |  4 ++--
 optillm.py               | 16 ++++++++--------
 optillm/cepo.py          | 36 ++++++++++++++++++------------------
 3 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/configs/cepo_config.yaml b/configs/cepo_config.yaml
index 3b2f55fd..9ccd7ace 100644
--- a/configs/cepo_config.yaml
+++ b/configs/cepo_config.yaml
@@ -6,8 +6,8 @@ planning_n: 3
 planning_m: 6
 planning_temperature_step1: 0.55
 planning_temperature_step2: 0.25
-planning_temperature_step3: 0.1
-planning_temperature_step4: 0
+planning_temperature_step3: 0.10
+planning_temperature_step4: 0.01
 planning_max_tokens_step1: 4096
 planning_max_tokens_step2: 4096
 planning_max_tokens_step3: 4096

diff --git a/optillm.py b/optillm.py
index 5bc2ddc6..f909d362 100644
--- a/optillm.py
+++ b/optillm.py
@@ -4,7 +4,7 @@
 import os
 import secrets
 from flask import Flask, request, jsonify
-from cerebras.cloud.sdk import Cerebras
+# from cerebras.cloud.sdk import Cerebras
 from openai import AzureOpenAI, OpenAI
 from flask import Response
 import json
@@ -55,13 +55,13 @@ def get_config():
         API_KEY = os.environ.get("OPTILLM_API_KEY")
         default_client = create_inference_client()

     # Cerebras, OpenAI, Azure, or LiteLLM API configuration
-    elif os.environ.get("CEREBRAS_API_KEY"):
-        API_KEY = os.environ.get("CEREBRAS_API_KEY")
-        base_url = server_config['base_url']
-        if base_url != "":
-            default_client = Cerebras(api_key=API_KEY, base_url=base_url)
-        else:
-            default_client = Cerebras(api_key=API_KEY)
+    # elif os.environ.get("CEREBRAS_API_KEY"):
+    #     API_KEY = os.environ.get("CEREBRAS_API_KEY")
+    #     base_url = server_config['base_url']
+    #     if base_url != "":
+    #         default_client = Cerebras(api_key=API_KEY, base_url=base_url)
+    #     else:
+    #         default_client = Cerebras(api_key=API_KEY)
     elif os.environ.get("OPENAI_API_KEY"):
         API_KEY = os.environ.get("OPENAI_API_KEY")
         base_url = server_config['base_url']

diff --git a/optillm/cepo.py b/optillm/cepo.py
index c73b901f..32e48684 100644
--- a/optillm/cepo.py
+++ b/optillm/cepo.py
@@ -1,31 +1,31 @@
 # Apache license 2 - added after the fork for the CePO method
 import re
-import cerebras
+# import cerebras
 import openai
 import yaml

 from dataclasses import dataclass
-from cerebras.cloud.sdk import BadRequestError as CerebrasBadRequestError
+# from cerebras.cloud.sdk import BadRequestError as CerebrasBadRequestError
 from openai import BadRequestError as OpenAIBadRequestError
-from typing import Optional, Literal
+from typing import Optional, Literal, Any


 @dataclass
 class CepoConfig:
-    bestofn_n: int  # number of responses to be generated in best of n stage
-    bestofn_temperature: float  # temperature for verifier in best of n stage
-    bestofn_max_tokens: int  # maximum number of tokens for verifier in best of n stage
-    bestofn_rating_type: Literal["absolute", "pairwise"]  # type of rating in best of n stage
-    planning_n: int  # number of plans generated in planning stage
-    planning_m: int  # number of attempts to generate n plans in planning stage
-    planning_temperature_step1: float  # temperature for generator in step 1 of planning stage
-    planning_temperature_step2: float  # temperature for generator in step 2 of planning stage
-    planning_temperature_step3: float  # temperature for generator in step 3 of planning stage
-    planning_temperature_step4: float  # temperature for generator in step 4 of planning stage
-    planning_max_tokens_step1: int  # maximum number of tokens in step 1 of planning stage
-    planning_max_tokens_step2: int  # maximum number of tokens in step 2 of planning stage
-    planning_max_tokens_step3: int  # maximum number of tokens in step 3 of planning stage
-    planning_max_tokens_step4: int  # maximum number of tokens in step 4 of planning stage
+    bestofn_n: int = 3  # number of responses to be generated in best of n stage
+    bestofn_temperature: float = 0.1  # temperature for verifier in best of n stage
+    bestofn_max_tokens: int = 4096  # maximum number of tokens for verifier in best of n stage
+    bestofn_rating_type: Literal["absolute", "pairwise"] = "absolute"  # type of rating in best of n stage
+    planning_n: int = 3  # number of plans generated in planning stage
+    planning_m: int = 6  # number of attempts to generate n plans in planning stage
+    planning_temperature_step1: float = 0.55  # temperature for generator in step 1 of planning stage
+    planning_temperature_step2: float = 0.25  # temperature for generator in step 2 of planning stage
+    planning_temperature_step3: float = 0.10  # temperature for generator in step 3 of planning stage
+    planning_temperature_step4: float = 0.01  # temperature for generator in step 4 of planning stage
+    planning_max_tokens_step1: int = 4096  # maximum number of tokens in step 1 of planning stage
+    planning_max_tokens_step2: int = 4096  # maximum number of tokens in step 2 of planning stage
+    planning_max_tokens_step3: int = 4096  # maximum number of tokens in step 3 of planning stage
+    planning_max_tokens_step4: int = 4096  # maximum number of tokens in step 4 of planning stage


 # given command line arguments which includes a yaml file path, initialize a CePO configuration
@@ -183,7 +183,7 @@ def generate_completion(system_prompt: str, task: str, client: Any, model: str,
             )
             final_solution = response.choices[0].message.content
             completion_tokens += response.usage.completion_tokens
-        except (CerebrasBadRequestError, OpenAIBadRequestError) as e:
+        except OpenAIBadRequestError as e:
             # In case of an error, take the first plan as the final solution
             final_solution = plans[0]
             messages = []

From 6ca116f482b483d9ec48e5c75c636bf069689d58 Mon Sep 17 00:00:00 2001
From: "Brian C. Van Essen"
Date: Fri, 7 Feb 2025 11:12:28 -0800
Subject: [PATCH 2/2] Fixed a bug in how the & and | operators were parsed.
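
When a segment of the model string combines approaches with '&' (AND) or
'|' (OR), the old code extended the approach list with the raw tokens from
split(), so any whitespace around the operator was kept and a token such
as ' mcts' never matched a known approach name. Each token is now stripped
before it is appended. A minimal illustration of the failure mode (the
approach names below are placeholders, not tied to the repository):

    "moa & mcts".split('&')
    # -> ['moa ', ' mcts']  (raw tokens keep the surrounding whitespace)
    [p.strip() for p in "moa & mcts".split('&')]
    # -> ['moa', 'mcts']    (stripped tokens match as intended)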
---
 optillm.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/optillm.py b/optillm.py
index f909d362..e59e39f6 100644
--- a/optillm.py
+++ b/optillm.py
@@ -231,10 +231,12 @@ def parse_combined_approach(model: str, known_approaches: list, plugin_approache
             approaches.append(part)
         elif '&' in part:
             operation = 'AND'
-            approaches.extend(part.split('&'))
+            for approach in part.split('&'):
+                approaches.append(approach.strip())
         elif '|' in part:
             operation = 'OR'
-            approaches.extend(part.split('|'))
+            for approach in part.split('|'):
+                approaches.append(approach.strip())
         else:
             parsing_approaches = False
             model_parts.append(part)
@@ -518,6 +520,7 @@ def proxy():
     n = data.get('n', server_config['n'])  # Get n value from request or config

     optillm_approach = data.get('optillm_approach', server_config['approach'])
+    # print(f'BVE - I think that the approach is {optillm_approach}')
     logger.debug(data)
     server_config['mcts_depth'] = data.get('mcts_depth', server_config['mcts_depth'])
     server_config['mcts_exploration'] = data.get('mcts_exploration', server_config['mcts_exploration'])
@@ -535,6 +538,7 @@ def proxy():
     default_client, api_key = get_config()

     operation, approaches, model = parse_combined_approach(model, known_approaches, plugin_approaches)
+    # print(f'BVE Checking for the combined approach {operation} and {approaches} with models {model}')
     logger.info(f'Using approach(es) {approaches}, operation {operation}, with model {model}')

     if bearer_token != "" and bearer_token.startswith("sk-"):
@@ -594,6 +598,7 @@ def proxy():
         if isinstance(messages, list) and messages:
             # Only process if format changed
             response = messages[-1]['content']
+        logger.debug(f'Using stream: {stream}')
         if stream:
             return Response(generate_streaming_response(response, model), content_type='text/event-stream')
         else:
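
Note on PATCH 1/2: with defaults on every CepoConfig field, the dataclass
can be instantiated without a YAML file, and a config file only needs to
list the fields it overrides. A minimal sketch of that usage (the override
loop is illustrative, not the repository's actual loader):

    import yaml
    from optillm.cepo import CepoConfig

    cepo_config = CepoConfig()  # every field falls back to its default
    with open("configs/cepo_config.yaml") as f:
        overrides = yaml.safe_load(f) or {}
    for key, value in overrides.items():
        if hasattr(cepo_config, key):  # ignore keys that are not config fields
            setattr(cepo_config, key, value)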