@@ -680,37 +680,52 @@ def _process_energyml_object_metadata(self, zf: zipfile.ZipFile, override: Overr
680680 file_path = override .part_name .lstrip ("/" )
681681 content_type = override .content_type
682682
683+ uuid , version , title , last_changed = None , None , None , None
684+
683685 try :
684686 # First try to extract UUID and version from file path (works for EXPANDED mode)
685687 uuid , version = extract_uuid_and_version_from_obj_path (file_path )
686688
687689 # For CLASSIC mode, version is not in the path, so we need to extract it from XML content
688- if uuid and version is None :
689- try :
690- # Read first chunk of XML to extract version without full parsing
691- with zf .open (file_path ) as f :
692- chunk = f .read (2048 ) # 2KB should be enough for root element
693- self .stats .bytes_read += len (chunk )
694- chunk_str = chunk .decode ("utf-8" , errors = "ignore" )
695-
696- # Extract version if present
697- version_patterns = [
698- r'object[Vv]ersion["\']?\s*[:=]\s*["\']([^"\']+)' ,
699- ]
690+ # => Finally I do it anyway to get the title.
691+ try :
692+ # Read first chunk of XML to extract version, title, and last_changed in one regex search
693+ # with self.zip_accessor.get_zip_file() as f:
694+ chunk = zf .read (4096 ) # 4KB to increase chance of catching citation block
695+ self .stats .bytes_read += len (chunk )
696+ chunk_str = chunk .decode ("utf-8" , errors = "ignore" )
697+
698+ # Single regex with named groups for version, title, and last_changed
699+ pattern = re .compile (
700+ r'object[Vv]ersion["\']?\s*[:=]\s*["\'](?P<version>[^"\']+)' # version attribute
701+ r"|<eml:Title>(?P<title>.*?)</eml:Title>" # eml:Title tag
702+ r"|<eml:LastUpdate>(?P<last_changed>.*?)</eml:LastUpdate>" ,
703+ re .DOTALL ,
704+ )
700705
701- for pattern in version_patterns :
702- version_match = re .search (pattern , chunk_str )
703- if version_match :
704- version = version_match .group (1 )
705- if not isinstance (version , str ):
706- version = str (version )
707- break
708- except Exception as e :
709- logging .debug (f"Failed to extract version from XML content for { file_path } : { e } " )
706+ # Iterate all matches and assign the first found for each group
707+ found = {"version" : None , "title" : None , "last_changed" : None }
708+ for match in pattern .finditer (chunk_str ):
709+ for key in found :
710+ if found [key ] is None and match .group (key ) is not None :
711+ found [key ] = match .group (key ).strip ()
712+ if version is None and found ["version" ] is not None :
713+ version = found ["version" ]
714+ if found ["title" ] is not None :
715+ title = found ["title" ]
716+ if found ["last_changed" ] is not None :
717+ last_changed = found ["last_changed" ]
718+ # Try to parse as datetime if possible
719+ try :
720+ last_changed = date_to_datetime (last_changed )
721+ except Exception :
722+ pass
723+ except Exception as e :
724+ logging .debug (f"Failed to extract version/title/last_update from XML content for { file_path } : { e } " )
710725
711726 if uuid : # Only process if we successfully extracted UUID
712727 uri = create_uri_from_content_type_or_qualified_type (ct_or_qt = content_type , uuid = uuid , version = version )
713- metadata = EpcObjectMetadata (uri = uri )
728+ metadata = EpcObjectMetadata (uri = uri , title = title , last_changed = last_changed )
714729
715730 # Store in indexes
716731 identifier = metadata .identifier
@@ -2055,6 +2070,44 @@ def list_objects(
20552070 ) -> List [ResourceMetadata ]:
20562071 return [m .to_resource_metadata () for m in self ._metadata_mgr .list_metadata (qualified_type_filter = object_type )]
20572072
def list_objects_parallel(
    self,
    dataspace: Optional[str] = None,
    object_type: Optional[str] = None,
    *,
    max_workers: int = 8,
) -> List[ResourceMetadata]:
    """List resource metadata for every object that can actually be loaded.

    The candidate metadata entries come from
    ``self._metadata_mgr.list_metadata(qualified_type_filter=object_type)``.
    Each candidate is loaded through ``self.get_object`` on a thread pool,
    and only the entries whose object was retrieved (not ``None`` and no
    exception) are converted with ``to_resource_metadata()``.

    Unlike an ``as_completed`` loop, the result keeps the order in which
    ``list_metadata`` returned the entries, so the output is deterministic
    and comparable with the sequential variants.

    Args:
        dataspace: Accepted for signature parity with the other listing
            methods; it is not used by this implementation.
        object_type: Qualified type forwarded as ``qualified_type_filter``.
        max_workers: Size of the thread pool (defaults to 8, the value that
            was previously hard-coded). Must be a positive integer, as
            required by ``ThreadPoolExecutor``.

    Returns:
        The resource metadata of the successfully retrieved objects, in
        ``list_metadata`` order.
    """
    # Function-scope import, as in the original implementation.
    from concurrent.futures import ThreadPoolExecutor

    # Materialise once: the entries are iterated twice (submit + collect).
    metadata_list = list(self._metadata_mgr.list_metadata(qualified_type_filter=object_type))
    if not metadata_list:
        return []

    resource_metadata_list = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit everything first so the loads overlap, then collect in
        # submission order to keep the output order stable.
        futures = [executor.submit(self.get_object, m.identifier) for m in metadata_list]
        for metadata, future in zip(metadata_list, futures):
            try:
                obj = future.result()
                if obj is not None:
                    resource_metadata_list.append(metadata.to_resource_metadata())
            except Exception as e:
                # Best effort: one unreadable object must not abort the listing.
                logging.debug(f"Failed to get object for metadata {metadata.identifier}: {e}")
    return resource_metadata_list
2095+
def list_objects_seq(
    self, dataspace: Optional[str] = None, object_type: Optional[str] = None
) -> List[ResourceMetadata]:
    """List resource metadata for every object that can actually be loaded.

    Walks the entries returned by
    ``self._metadata_mgr.list_metadata(qualified_type_filter=object_type)``
    one at a time, loads each object with ``self.get_object`` and keeps the
    entry only when the load returned something other than ``None``.

    Args:
        dataspace: Accepted but not used by this implementation.
        object_type: Qualified type forwarded as ``qualified_type_filter``.

    Returns:
        The ``to_resource_metadata()`` result of each successfully loaded
        entry, in the order the entries were listed.
    """
    candidates = self._metadata_mgr.list_metadata(qualified_type_filter=object_type)
    collected = []
    for entry in candidates:
        try:
            if self.get_object(entry.identifier) is None:
                # Nothing could be loaded for this entry: leave it out.
                continue
            collected.append(entry.to_resource_metadata())
        except Exception as e:
            # Best effort: a failing entry is logged and skipped.
            logging.debug(f"Failed to get object for metadata {entry.identifier}: {e}")

    return collected
2110+
20582111 def get_obj_rels (self , obj : Union [str , Uri , Any ]) -> List [Relationship ]:
20592112 _id = self ._id_from_uri_or_identifier (obj )
20602113
0 commit comments