Commit 35ddc1a

update
1 parent 57903b4 commit 35ddc1a

21 files changed

Lines changed: 1600 additions & 173 deletions

editscore/__init__.py

Lines changed: 65 additions & 4 deletions
@@ -8,6 +8,7 @@
 import math
 from . import vie_prompts
 import numpy as np
+from .json_parser import parse_vlm_output_to_dict
 
 class EditScore:
     def __init__(
@@ -91,16 +92,16 @@ def evaluate(self, image_prompts, text_prompt):
             max_tries = 2
             while SC_dict is False or PQ_dict is False:
                 tries += 1
-                guess_if_cannot_parse = True if tries > max_tries else False
+                give_up_parsing = True if tries > max_tries else False
 
                 result_SC = self.model.inference(SC_prompt_final, seed=self.seed + i)
                 result_PQ = self.model.inference(PQ_prompt_final, seed=self.seed + i)
 
                 if result_SC in ["I'm sorry, but I can't assist with that request."] or result_PQ in ["I'm sorry, but I can't assist with that request."]:
-                    guess_if_cannot_parse = True
+                    give_up_parsing = True
 
-                SC_dict = mllm_output_to_dict(result_SC, give_up_parsing=guess_if_cannot_parse, text_prompt=text_prompt, score_range=self.score_range)
-                PQ_dict = mllm_output_to_dict(result_PQ, give_up_parsing=guess_if_cannot_parse, text_prompt=text_prompt, score_range=self.score_range)
+                SC_dict = mllm_output_to_dict(result_SC, give_up_parsing=give_up_parsing, text_prompt=text_prompt, score_range=self.score_range)
+                PQ_dict = mllm_output_to_dict(result_PQ, give_up_parsing=give_up_parsing, text_prompt=text_prompt, score_range=self.score_range)
 
                 if SC_dict == "rate_limit_exceeded" or PQ_dict == "rate_limit_exceeded":
                     print("rate_limit_exceeded")
@@ -136,3 +137,63 @@ def evaluate(self, image_prompts, text_prompt):
         if self.reduction == "average_first":
             output["overall"] = math.sqrt(output["prompt_following"] * output["perceptual_quality"])
         return output
+
+
+    def batch_evaluate(self, image_prompts, text_prompt):
+        SC_prompt = [self.SC_prompt.replace("<instruction>", _text_prompt) for _text_prompt in text_prompt]
+
+        SC_prompt = [self.model.prepare_input(image_prompt, _SC_prompt) for image_prompt, _SC_prompt in zip(image_prompts, SC_prompt)]
+        PQ_prompt = [self.model.prepare_input(image_prompt, self.PQ_prompt) for image_prompt in image_prompts]
+
+        outputs_multi_pass = [[] for _ in range(len(image_prompts))]
+        for i in range(self.num_pass):
+            results = self.model.batch_inference(SC_prompt + PQ_prompt, seed=self.seed + i)
+
+            SC_evaluations = [parse_vlm_output_to_dict(results[i]) for i in range(len(results) // 2)]
+            PQ_evaluations = [parse_vlm_output_to_dict(results[i]) for i in range(len(results) // 2, len(results))]
+
+            for idx, (SC_evaluation, PQ_evaluation) in enumerate(zip(SC_evaluations, PQ_evaluations)):
+                SC_scores = SC_evaluation["score"]
+                PQ_scores = PQ_evaluation["score"]
+
+                if len(SC_scores) == 0:
+                    SC_scores = [self.score_range / 2]
+                if len(PQ_scores) == 0:
+                    PQ_scores = [self.score_range / 2]
+
+                SC_score = min(SC_scores) / (self.score_range / 10)
+                PQ_score = min(PQ_scores) / (self.score_range / 10)
+                if SC_score < 0 or SC_score > 10:
+                    SC_score = self.score_range / 2
+                if PQ_score < 0 or PQ_score > 10:
+                    PQ_score = self.score_range / 2
+                O_score = math.sqrt(SC_score * PQ_score)
+
+                outputs_multi_pass[idx].append(
+                    {
+                        "SC_score": SC_score,
+                        "PQ_score": PQ_score,
+                        "O_score": O_score,
+                        "SC_score_reasoning": SC_evaluation["reasoning"],
+                        "PQ_score_reasoning": PQ_evaluation["reasoning"],
+                        "SC_raw_output": results[idx],
+                        "PQ_raw_output": results[len(results) // 2 + idx],
+                    }
+                )
+
+        outputs = []
+        for idx, outputs_per_prompt in enumerate(outputs_multi_pass):
+            outputs.append(
+                {
+                    "SC_score": np.mean([output_per_pass["SC_score"] for output_per_pass in outputs_per_prompt]),
+                    "PQ_score": np.mean([output_per_pass["PQ_score"] for output_per_pass in outputs_per_prompt]),
+                    "O_score": np.mean([output_per_pass["O_score"] for output_per_pass in outputs_per_prompt]),
+                    "SC_score_reasoning": outputs_per_prompt[0]["SC_score_reasoning"],
+                    "PQ_score_reasoning": outputs_per_prompt[0]["PQ_score_reasoning"],
+                    "SC_raw_output": outputs_per_prompt[0]["SC_raw_output"],
+                    "PQ_raw_output": outputs_per_prompt[0]["PQ_raw_output"],
+                }
+            )
+            if self.reduction == "average_first":
+                outputs[-1]["O_score"] = math.sqrt(outputs[-1]["SC_score"] * outputs[-1]["PQ_score"])
+        return outputs
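
The new batch_evaluate mirrors evaluate but concatenates all SC and PQ prompts into one batch_inference call and averages per-pass scores. A minimal usage sketch, not part of the commit: the constructor arguments and the layout of each image tuple are assumptions, since only batch_evaluate itself appears in the diff.

# Hypothetical usage; EditScore's constructor arguments are assumed, not shown in this diff.
from editscore import EditScore

scorer = EditScore(backbone="qwen25vl_vllm")  # hypothetical argument name

# One (source image, edited image) pair per instruction; prepare_input defines the exact layout.
image_prompts = [("input_0.png", "edited_0.png"), ("input_1.png", "edited_1.png")]
instructions = ["make the sky purple", "remove the red car"]

results = scorer.batch_evaluate(image_prompts, instructions)
for res in results:
    # Each entry holds SC/PQ/O scores averaged over num_pass passes (see diff above).
    print(res["SC_score"], res["PQ_score"], res["O_score"])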

editscore/json_parser.py

Lines changed: 173 additions & 0 deletions
@@ -0,0 +1,173 @@
+import json
+import re
+from typing import Dict, Any, List, Optional
+
+# ==============================================================================
+# HELPER FUNCTIONS (Based on your provided robust fixers)
+# ==============================================================================
+# For clarity, these are named as internal functions (prefixed with an underscore).
+
+def _fix_json_quotes(s: str) -> str:
+    """First-stage repair: handle incorrect quotes and basic structure."""
+    # Replace Python-style booleans/None with JSON standard
+    s = re.sub(r'\bTrue\b', 'true', s)
+    s = re.sub(r'\bFalse\b', 'false', s)
+    s = re.sub(r'\bNone\b', 'null', s)
+
+    # Attempt to replace single quotes with double quotes (a common VLM error)
+    # This is a high-risk operation that might break the reasoning content,
+    # but it's worth trying early on.
+    try:
+        temp_s = s.replace("'", '"')
+        json.loads(temp_s)
+        return temp_s
+    except json.JSONDecodeError:
+        # If it's still invalid after replacement, return the original string for the next repair step.
+        pass
+
+    # Add double quotes to keys (e.g., {reasoning: ...} -> {"reasoning": ...})
+    s = re.sub(r'([\{\s,])(\w+)\s*:', r'\1"\2":', s)
+    return s
+
+def _repair_reasoning_field_robust(json_str: str) -> str:
+    """Second-stage repair: specifically fix unescaped double quotes inside the 'reasoning' field."""
+    pattern = re.compile(
+        r'("reasoning"\s*:\s*")'  # --- Group 1: "reasoning" : "
+        r'(.*?)'                  # --- Group 2: The content (non-greedy)
+        r'(?="\s*[,}])',          # --- Lookahead: find " followed by , or }
+        re.DOTALL
+    )
+
+    def replacer(match):
+        prefix = match.group(1)
+        content = match.group(2)
+        # In the content, replace all unescaped " with \"
+        fixed_content = content.replace('"', '\\"')
+        return prefix + fixed_content
+
+    return pattern.sub(replacer, json_str)
+
+def _fallback_extract_and_rebuild(input_str: str) -> str:
+    """Final fallback strategy: abandon repair, directly extract information, and rebuild a valid JSON."""
+    # 1. Extract reasoning
+    # Find all content between "reasoning": and ,"score":
+    reasoning_text = ""
+    reason_match = re.search(r'["\']reasoning["\']\s*:\s*["\']?(.*?)["\']?\s*,\s*["\']score["\']', input_str, re.DOTALL | re.IGNORECASE)
+    if reason_match:
+        reasoning_text = reason_match.group(1).strip()
+        # Clean up any potentially remaining escape characters
+        reasoning_text = reasoning_text.replace('\\"', '"')
+    else:
+        # If not found, assume all text besides the score part is the reasoning.
+        # First, remove the score part.
+        score_part_match = re.search(r'["\']score["\']\s*:.*', input_str, re.IGNORECASE)
+        if score_part_match:
+            reasoning_text = input_str[:score_part_match.start()].strip()
+        else:
+            # If even 'score' cannot be found, assume the entire string is the reasoning.
+            reasoning_text = input_str
+
+    # 2. Extract scores
+    scores = []
+    # Prioritize searching after the 'score' keyword.
+    score_match = re.search(r'["\']score["\']\s*:\s*(.*)', input_str, re.DOTALL | re.IGNORECASE)
+    search_area = score_match.group(1) if score_match else input_str
+
+    # Find all integers or floats.
+    numbers = re.findall(r'[-+]?\d*\.?\d+', search_area)
+    if numbers:
+        scores = [float(num) for num in numbers]
+
+    # 3. Rebuild into a standard dictionary and return a JSON string.
+    rebuilt_data = {
+        "reasoning": reasoning_text,
+        "score": scores
+    }
+    return json.dumps(rebuilt_data, ensure_ascii=False)
+
+
+def _format_and_validate_dict(data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+    """Validate and format the parsed dictionary to ensure it meets the final output standard."""
+    if not isinstance(data, dict):
+        return None
+
+    # Extract reasoning, tolerating case and spelling variations.
+    reasoning = ""
+    for key in ["reasoning", "reason", "rationale"]:
+        if key in data and isinstance(data[key], str):
+            reasoning = data[key]
+            break
+
+    # Extract score, and ensure it is a list of floats.
+    scores = []
+    if 'score' in data:
+        score_val = data['score']
+        if isinstance(score_val, list):
+            scores = [float(s) for s in score_val if isinstance(s, (int, float, str))]
+        elif isinstance(score_val, (int, float)):
+            scores = [float(score_val)]
+
+    # If any field was found, consider it a success.
+    if reasoning or scores:
+        return {"score": scores, "reasoning": reasoning}
+
+    return None
+
+# ==============================================================================
+# MAIN PARSING FUNCTION
+# ==============================================================================
+
+def parse_vlm_output_to_dict(input_string: str) -> Dict[str, Any]:
+    """
+    A highly robust function to parse a VLM's output string into a dictionary
+    containing 'score' and 'reasoning'.
+
+    It uses a multi-stage repair pipeline, progressively degrading from standard
+    JSON parsing to a final information extraction fallback.
+    """
+    # --- 0. Preprocessing ---
+    if not input_string or not input_string.strip():
+        return {"score": [], "reasoning": "Input was empty."}
+
+    # Find the substring enclosed by `{}`, which is often the core of the VLM output.
+    json_match = re.search(r'\{.*\}', input_string, re.DOTALL)
+    target_str = json_match.group(0) if json_match else input_string.strip()
+
+    # --- Repair Pipeline ---
+    # Apply fixers in order, attempting to parse after each one.
+
+    fixer_pipeline = [
+        lambda s: s,                     # 1. Try the original string.
+        _fix_json_quotes,                # 2. Fix basic quotes and keywords.
+        _repair_reasoning_field_robust,  # 3. Fix internal quotes in the reasoning field.
+    ]
+
+    for fixer in fixer_pipeline:
+        try:
+            fixed_str = fixer(target_str)
+            data = json.loads(fixed_str)
+            validated_data = _format_and_validate_dict(data)
+            if validated_data is not None:
+                return validated_data
+        except (json.JSONDecodeError, TypeError):
+            # If it fails, continue to the next fixer.
+            continue
+
+    # --- Final Fallback Strategy ---
+    # If all repair and parsing attempts fail, activate the information extraction mode.
+    try:
+        fallback_str = _fallback_extract_and_rebuild(target_str)
+        # This function guarantees a valid JSON string, so we can load it directly.
+        data = json.loads(fallback_str)
+        # Still run it through the validator to standardize the format.
+        validated_data = _format_and_validate_dict(data)
+        if validated_data:
+            return validated_data
+    except Exception:
+        # If even the final fallback strategy fails, return an error message.
+        pass
+
+    return {
+        "score": [],
+        "reasoning": f"Failed to parse after all strategies. Original output: '{input_string}'"
+    }
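
A small usage sketch, not part of the commit, exercising the three paths of the parser added above: clean JSON parsed directly, Python-style quoting repaired by _fix_json_quotes, and free-form text handled by the extraction fallback.

from editscore.json_parser import parse_vlm_output_to_dict

# Valid JSON parses on the first pipeline stage (identity fixer).
parse_vlm_output_to_dict('{"reasoning": "Edit applied cleanly.", "score": [8]}')
# -> {"score": [8.0], "reasoning": "Edit applied cleanly."}

# Single quotes (a common VLM error) are repaired by _fix_json_quotes.
parse_vlm_output_to_dict("{'reasoning': 'Minor artifacts.', 'score': [6, 7]}")
# -> {"score": [6.0, 7.0], "reasoning": "Minor artifacts."}

# Free-form text falls through to _fallback_extract_and_rebuild, which pulls
# out the number and keeps the raw text as the reasoning.
parse_vlm_output_to_dict("reasoning: looks fine score: 9")
# -> {"score": [9.0], "reasoning": "reasoning: looks fine score: 9"}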

editscore/mllm_tools/qwen25vl_vllm.py

Lines changed: 22 additions & 3 deletions
@@ -1,6 +1,7 @@
 from typing import Optional
 
 import os
+import hashlib
 import random
 import time
 import numpy as np
@@ -56,8 +57,13 @@ def __init__(
 
         if self.enable_lora:
             if cache_dir is None:
-                root_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-                cache_dir = os.path.join(root_dir, "cache", f"{os.path.basename(vlm_model)}_merged_lora")
+                root_dir = torch.hub.get_dir()  # default: ~/.cache/torch/hub
+
+                lora_filename = os.path.splitext(os.path.basename(lora_path))[0]
+                lora_hash = hashlib.md5(lora_path.encode()).hexdigest()[:8]
+                lora_identifier = f"{lora_filename}_{lora_hash}"
+
+                cache_dir = os.path.join(root_dir, "EditScore", f"{os.path.basename(vlm_model)}_merged_lora_{lora_identifier}")
 
             if not os.path.exists(cache_dir):
                 print(f"Merging LORA to {vlm_model} and saving to {cache_dir}", flush=True)
@@ -120,4 +126,17 @@ def inference(self, messages, seed: Optional[int] = None):
             instruction = output.outputs[0].text.strip()
             responses.append(instruction)
 
-        return responses[0]
+        return responses[0]
+
+    def batch_inference(self, messages, seed: Optional[int] = None):
+        seed = self.seed if seed is None else seed
+        sampling_params = SamplingParams(max_tokens=512, temperature=self.temperature, top_p=0.9, top_k=20, seed=seed)
+        outputs = self.model.generate(messages, sampling_params, use_tqdm=False)
+
+        responses = []
+        for output in outputs:
+            instruction = output.outputs[0].text.strip()
+            responses.append(instruction)
+
+        return responses
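
The cache-path change above keys each merged-LoRA directory by the checkpoint filename plus a short hash of its full path, so two different LoRA checkpoints no longer collide on one shared "cache" directory. A standalone sketch of the naming scheme; the lora_path and vlm_model values are made up for illustration.

import hashlib
import os

lora_path = "/checkpoints/editscore_lora.safetensors"  # hypothetical checkpoint
vlm_model = "Qwen2.5-VL-7B-Instruct"                   # hypothetical model name
root_dir = os.path.expanduser("~/.cache/torch/hub")    # torch.hub.get_dir() default

lora_filename = os.path.splitext(os.path.basename(lora_path))[0]  # "editscore_lora"
lora_hash = hashlib.md5(lora_path.encode()).hexdigest()[:8]       # stable 8-char digest of the full path
cache_dir = os.path.join(root_dir, "EditScore", f"{os.path.basename(vlm_model)}_merged_lora_{lora_filename}_{lora_hash}")
print(cache_dir)
# e.g. ~/.cache/torch/hub/EditScore/Qwen2.5-VL-7B-Instruct_merged_lora_editscore_lora_<hash>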
