@@ -1540,6 +1540,19 @@ var TAGS_TO_REMOVE = ['script', 'style', 'form'].join(',');
15401540// the src attribute so the images are no longer lazy loaded.
15411541
15421542function convertLazyLoadedImages ( $ ) {
// Try to pull an image URL out of an attribute value that holds a JSON object
// (e.g. '{"src": "https://example.com/a.jpg"}').
//
// str: the raw attribute value.
// Returns the object's `src` when it is a string; otherwise returns false
// (not JSON, not an object, no `src`, or `src` is not a string).
var extractSrcFromJSON = function extractSrcFromJSON(str) {
  // Only a JSON object can carry a `src` key. Most attribute values are plain
  // URLs, so bail out here instead of letting JSON.parse throw on each of them.
  if (!/^\s*\{/.test(str)) return false;

  try {
    var parsed = JSON.parse(str);

    if (typeof parsed.src === 'string') return parsed.src;
  } catch (e) {
    // Malformed JSON is expected for arbitrary attribute values; this lookup is
    // best-effort, so report "no src found" rather than propagating the error.
    return false;
  }

  return false;
};
1555+
15431556 $ ( 'img' ) . each ( function ( _ , img ) {
15441557 var attrs = getAttrs ( img ) ;
15451558
@@ -1549,7 +1562,14 @@ function convertLazyLoadedImages($) {
15491562 if ( attr !== 'srcset' && IS_LINK . test ( value ) && IS_SRCSET . test ( value ) ) {
15501563 $ ( img ) . attr ( 'srcset' , value ) ;
15511564 } else if ( attr !== 'src' && attr !== 'srcset' && IS_LINK . test ( value ) && IS_IMAGE . test ( value ) ) {
1552- $ ( img ) . attr ( 'src' , value ) ;
1565+ // Is the value a JSON object? If so, we should attempt to extract the image src from the data.
1566+ var existingSrc = extractSrcFromJSON ( value ) ;
1567+
1568+ if ( existingSrc ) {
1569+ $ ( img ) . attr ( 'src' , existingSrc ) ;
1570+ } else {
1571+ $ ( img ) . attr ( 'src' , value ) ;
1572+ }
15531573 }
15541574 } ) ;
15551575 } ) ;
@@ -2388,6 +2408,14 @@ var MediumExtractor = {
23882408 // Is there anything in the content you selected that needs to be transformed
23892409 // before it's consumable content? E.g., unusual lazy-loaded images
23902410 transforms : {
2411+ // Allow drop cap character.
// Transform for nodes matching 'section span:first-of-type'. It reads the node's
// inner HTML and, when that HTML is exactly one character accepted by the pattern
// below, replaces the whole node with that bare character.
// NOTE(review): presumably this unwraps the drop cap so it is not lost with its
// wrapping span during cleaning — confirm against the `clean` selectors.
2412+ 'section span:first-of-type' : function sectionSpanFirstOfType ( $node ) {
// Despite the `$` prefix, this holds the value returned by html() and is used as
// a string (length check, regex test).
2413+ var $text = $node . html ( ) ;
2414+
// The length check already restricts the match to a single character, so the
// pattern only has to decide whether that one character is acceptable.
// NOTE(review): the pattern appears whitespace-mangled in this view; it looks
// like it is meant to accept ASCII letters and parentheses — confirm.
2415+ if ( $text . length === 1 && / ^ [ a - z A - Z ( ) ] + $ / . test ( $text ) ) {
2416+ $node . replaceWith ( $text ) ;
2417+ }
2418+ } ,
23912419 // Re-write lazy-loaded youtube videos
23922420 iframe : function iframe ( $node ) {
23932421 var ytRe = / h t t p s : \/ \/ i .e m b e d .l y \/ .+ u r l = h t t p s : \/ \/ i \. y t i m g \. c o m \/ v i \/ ( \w + ) \/ / ;
@@ -2429,7 +2457,7 @@ var MediumExtractor = {
24292457 // Is there anything that is in the result that shouldn't be?
24302458 // The clean selectors will remove anything that matches from
24312459 // the result
2432- clean : [ 'span' , 'svg' ]
2460+ clean : [ 'span a ' , 'svg' ]
24332461 } ,
24342462 date_published : {
24352463 selectors : [ [ 'meta[name="article:published_time"]' , 'value' ] ]
@@ -6411,10 +6439,14 @@ function cleanDatePublished(dateString) {
64116439 format = _ref . format ;
64126440
64136441 // If string is in milliseconds or seconds, convert to int and return
6414- if ( MS_DATE_STRING . test ( dateString ) || SEC_DATE_STRING . test ( dateString ) ) {
6442+ if ( MS_DATE_STRING . test ( dateString ) ) {
64156443 return new Date ( _parseInt ( dateString , 10 ) ) . toISOString ( ) ;
64166444 }
64176445
6446+ if ( SEC_DATE_STRING . test ( dateString ) ) {
6447+ return new Date ( _parseInt ( dateString , 10 ) * 1000 ) . toISOString ( ) ;
6448+ }
6449+
64186450 var date = createDate ( dateString , timezone , format ) ;
64196451
64206452 if ( ! date . isValid ( ) ) {
@@ -7546,13 +7578,26 @@ var GenericExcerptExtractor = {
75467578 }
75477579} ;
75487580
// Primary word counter: counts whitespace-separated tokens in the text of the
// first <div> found in `content`.
//
// content: markup string handed to cheerio.load.
// Returns the number of pieces produced by splitting the text on single
// whitespace characters. Splitting an empty string still yields one piece, so
// the result is never 0.
// NOTE(review): normalizeSpaces is defined elsewhere; presumably it collapses
// runs of whitespace — confirm.
var getWordCount = function getWordCount(content) {
  var firstDiv = cheerio.load(content)('div').first();
  var pieces = normalizeSpaces(firstDiv.text()).split(/\s/);
  return pieces.length;
};
7587+
// Fallback word counter that works directly on the markup string without
// parsing it into a DOM.
//
// content: markup string.
// Returns the number of space-separated words left after every tag is replaced
// by a space and whitespace runs are collapsed; 0 when no text remains.
var getWordCountAlt = function getWordCountAlt(content) {
  // Work on a local copy rather than reassigning the parameter.
  var text = content
    .replace(/<[^>]*>/g, ' ') // tags become separators so adjacent elements don't fuse words
    .replace(/\s+/g, ' ')
    .trim();

  // ''.split(' ') is [''], which would report one word for empty content.
  if (text === '') return 0;

  return text.split(' ').length;
};
7594+
// Generic word-count extractor. `extract` takes an options object, reads its
// `content` property, and returns a word count for it.
// This span is a diff hunk: the lines marked `-` are the previous inline
// implementation, and the lines marked `+` delegate to getWordCount with
// getWordCountAlt as a fallback.
75497595var GenericWordCountExtractor = {
75507596 extract : function extract ( _ref ) {
75517597 var content = _ref . content ;
7552- var $ = cheerio . load ( content ) ;
7553- var $content = $ ( 'div' ) . first ( ) ;
7554- var text = normalizeSpaces ( $content . text ( ) ) ;
7555- return text . split ( / \s / ) . length ;
7598+ var count = getWordCount ( content ) ;
// A count of exactly 1 triggers a recount with the tag-stripping counter.
// NOTE(review): presumably 1 is what getWordCount yields when its <div> lookup
// finds no text — confirm.
7599+ if ( count === 1 ) count = getWordCountAlt ( content ) ;
7600+ return count ;
75567601 }
75577602} ;
75587603
@@ -7715,7 +7760,8 @@ function select(opts) {
77157760 _extractionOpts$defau = extractionOpts . defaultCleaner ,
77167761 defaultCleaner = _extractionOpts$defau === void 0 ? true : _extractionOpts$defau ,
77177762 allowMultiple = extractionOpts . allowMultiple ;
7718- var matchingSelector = findMatchingSelector ( $ , selectors , extractHtml , allowMultiple ) ;
7763+ var overrideAllowMultiple = type === 'lead_image_url' || allowMultiple ;
7764+ var matchingSelector = findMatchingSelector ( $ , selectors , extractHtml , overrideAllowMultiple ) ;
77197765 if ( ! matchingSelector ) return null ;
77207766
77217767 function transformAndClean ( $node ) {
@@ -7988,7 +8034,7 @@ function _collectAllPages() {
79888034 } ) ;
79898035 return _context . abrupt ( "return" , _objectSpread ( { } , result , {
79908036 total_pages : pages ,
7991- pages_rendered : pages ,
8037+ rendered_pages : pages ,
79928038 word_count : word_count
79938039 } ) ) ;
79948040
0 commit comments