@@ -501,6 +501,8 @@ def _get_sklearn_description(self, model: Any, char_lim: int = 1024) -> str:
501501 def match_format (s ):
502502 return "{}\n {}\n " .format (s , len (s ) * '-' )
503503 s = inspect .getdoc (model )
504+ if s is None :
505+ return ''
504506 if len (s ) <= char_lim :
505507 # if the fetched docstring is smaller than char_lim, no trimming required
506508 return s .strip ()
@@ -528,6 +530,105 @@ def match_format(s):
528530 s = "{}..." .format (s [:char_lim - 3 ])
529531 return s .strip ()
530532
def _extract_sklearn_parameter_docstring(self, model) -> Union[None, str]:
    """Extract the 'Parameters' section from a model's docstring.

    The docstring is fetched with ``inspect.getdoc`` and cut from the
    'Parameters' heading up to the nearest following section heading
    ('Attributes', 'Notes', 'See also' / 'See Also', 'Note', 'References' or
    'Examples'), whichever appears first in the text. If none of them follows,
    everything up to the end of the docstring is kept.

    Parameters
    ----------
    model : sklearn model
        Object whose docstring is inspected.

    Returns
    -------
    str, or None
        The stripped 'Parameters' section (heading included), or None when the
        model has no docstring or the docstring has no 'Parameters' section.
    """
    # Local import keeps this method self-contained; diagnostics go to the
    # logging system instead of being printed to stdout.
    import logging

    def match_format(s):
        # numpydoc heading: the title followed by an underline of equal length
        return "{}\n{}\n".format(s, len(s) * '-')

    s = inspect.getdoc(model)
    if s is None:
        return None

    parameters_heading = match_format("Parameters")
    start = s.find(parameters_heading)
    if start == -1:
        logging.getLogger(__name__).debug(
            "No 'Parameters' section found in the docstring of %r", model
        )
        return None

    headings = [
        "Attributes", "Notes", "See also", "See Also", "Note", "References", "Examples",
    ]
    # Only look behind the 'Parameters' heading and cut at the earliest heading
    # found there, independent of the order of `headings`.
    search_from = start + len(parameters_heading)
    candidates = [s.find(match_format(h), search_from) for h in headings]
    candidates = [position for position in candidates if position != -1]
    # When only 'Parameters' exists, keep everything up to the end.
    end = min(candidates) if candidates else len(s)
    return s[start:end].strip()
576+
def _extract_sklearn_param_info(self, model, char_lim=1024) -> Union[None, Dict]:
    """Parse parameter types and descriptions from a sklearn docstring.

    Every line of the 'Parameters' section that looks like ``name : type``
    starts a new parameter; the lines that follow, up to the next such line,
    form its description. Parameters reported by ``model.get_params()`` that
    are not documented are added with ``[None, None]`` so that lookups by
    parameter name do not fail.

    Parameters
    ----------
    model : sklearn model
    char_lim : int
        Specifying the max length of the returned description strings.
        OpenML servers have a constraint of 1024 characters string fields.

    Returns
    -------
    Dict, or None
        Ordered mapping ``name -> [data_type, description]``, or None when the
        docstring has no 'Parameters' section.
    """
    docstring = self._extract_sklearn_parameter_docstring(model)
    if docstring is None:
        # when sklearn docstring has no 'Parameters' section
        return None

    # Splits on newlines; a run of literal dots directly before the newline is
    # consumed by the separator as well.
    line_break = re.compile("[.]*\n")
    definition = re.compile("[a-z0-9_ ]+ : [a-z0-9_']+[a-z0-9_ ]*", flags=re.IGNORECASE)

    # Single pass: each definition carries its own list of description lines,
    # so names, types and descriptions cannot get out of step.
    entries = []  # type: List
    for line in line_break.split(docstring):
        found = definition.findall(line)
        if found:
            text = []  # type: List
            for match in found:
                key, value = match.split(':')
                entries.append((key.strip(), value.strip(), text))
        elif entries:
            # lines before the first definition (the heading) are ignored
            entries[-1][2].append(line)

    parameter_docs = OrderedDict()  # type: Dict
    for key, data_type, text in entries:
        description = '\n'.join(text).strip()
        if len(description) > char_lim:
            description = "{}...".format(description[:char_lim - 3])
        parameter_docs[key] = [data_type, description]

    # to avoid KeyError for missing parameters; iterating get_params() directly
    # keeps the insertion order deterministic
    for param in model.get_params():
        if param not in parameter_docs:
            parameter_docs[param] = [None, None]

    return parameter_docs
631+
531632 def _serialize_model (self , model : Any ) -> OpenMLFlow :
532633 """Create an OpenMLFlow.
533634
@@ -656,97 +757,6 @@ def _check_multiple_occurence_of_component_in_flow(
656757 known_sub_components .add (visitee .name )
657758 to_visit_stack .extend (visitee .components .values ())
658759
def _extract_sklearn_parameter_docstring(self, model) -> Union[None, str]:
    """Extract the 'Parameters' section from a model's docstring.

    The docstring is fetched with ``inspect.getdoc`` and cut from the
    'Parameters' heading up to the nearest following section heading
    ('Attributes', 'Notes', 'See also' / 'See Also', 'Note', 'References' or
    'Examples'), whichever appears first in the text. If none of them follows,
    everything up to the end of the docstring is kept.

    Parameters
    ----------
    model : sklearn model
        Object whose docstring is inspected.

    Returns
    -------
    str, or None
        The stripped 'Parameters' section (heading included), or None when the
        model has no docstring or the docstring has no 'Parameters' section.
    """
    # Local import keeps this method self-contained; diagnostics go to the
    # logging system instead of being printed to stdout.
    import logging

    def match_format(s):
        # numpydoc heading: the title followed by an underline of equal length
        return "{}\n{}\n".format(s, len(s) * '-')

    s = inspect.getdoc(model)
    if s is None:
        # inspect.getdoc returns None for objects without a docstring
        return None

    parameters_heading = match_format("Parameters")
    start = s.find(parameters_heading)
    if start == -1:
        logging.getLogger(__name__).debug(
            "No 'Parameters' section found in the docstring of %r", model
        )
        return None

    headings = [
        "Attributes", "Notes", "See also", "See Also", "Note", "References", "Examples",
    ]
    # Only look behind the 'Parameters' heading and cut at the earliest heading
    # found there, independent of the order of `headings`.
    search_from = start + len(parameters_heading)
    candidates = [s.find(match_format(h), search_from) for h in headings]
    candidates = [position for position in candidates if position != -1]
    # When only 'Parameters' exists, keep everything up to the end.
    end = min(candidates) if candidates else len(s)
    return s[start:end].strip()
700-
def _extract_sklearn_param_info(self, model, char_lim=1024) -> Union[None, Dict]:
    """Parse parameter types and descriptions from a sklearn docstring.

    Every line of the 'Parameters' section that looks like ``name : type``
    starts a new parameter; the lines that follow, up to the next such line,
    form its description. If the model offers ``get_params()``, parameters it
    reports that are not documented are added with ``[None, None]`` so that
    lookups by parameter name do not fail.

    Parameters
    ----------
    model : sklearn model
    char_lim : int
        Specifying the max length of the returned description strings.
        OpenML servers have a constraint of 1024 characters string fields.

    Returns
    -------
    Dict, or None
        Ordered mapping ``name -> [data_type, description]``, or None when the
        docstring has no 'Parameters' section.
    """
    docstring = self._extract_sklearn_parameter_docstring(model)
    if docstring is None:
        # when sklearn docstring has no 'Parameters' section
        return None

    # Splits on newlines; a run of literal dots directly before the newline is
    # consumed by the separator as well.
    line_break = re.compile("[.]*\n")
    definition = re.compile("[a-z0-9_ ]+ : [a-z0-9_']+[a-z0-9_ ]*", flags=re.IGNORECASE)

    # Single pass: each definition carries its own list of description lines,
    # so names, types and descriptions cannot get out of step and an empty
    # section simply yields no entries.
    entries = []  # type: List
    for line in line_break.split(docstring):
        found = definition.findall(line)
        if found:
            text = []  # type: List
            for match in found:
                key, value = match.split(':')
                entries.append((key.strip(), value.strip(), text))
        elif entries:
            # lines before the first definition (the heading) are ignored
            entries[-1][2].append(line)

    parameter_docs = OrderedDict()  # type: Dict
    for key, data_type, text in entries:
        description = '\n'.join(text).strip()
        if len(description) > char_lim:
            description = "{}...".format(description[:char_lim - 3])
        parameter_docs[key] = [data_type, description]

    # Undocumented parameters get a placeholder so callers indexing by
    # parameter name do not hit a KeyError. Guarded so that objects without
    # get_params() behave as before.
    get_params = getattr(model, "get_params", None)
    if callable(get_params):
        for param in get_params():
            if param not in parameter_docs:
                parameter_docs[param] = [None, None]

    return parameter_docs
749-
750760 def _extract_information_from_model (
751761 self ,
752762 model : Any ,
@@ -890,6 +900,10 @@ def flatten_all(list_):
890900 parameters [k ] = None
891901
892902 if parameters_docs is not None :
903+ # print(type(model))
904+ # print(sorted(parameters_docs.keys()))
905+ # print(sorted(model_parameters.keys()))
906+ # print()
893907 data_type , description = parameters_docs [k ]
894908 parameters_meta_info [k ] = OrderedDict ((('description' , description ),
895909 ('data_type' , data_type )))
0 commit comments