@ -3,6 +3,19 @@
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				class  LinkDetailsExtractor 
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				  include  ActionView :: Helpers :: TagHelper 
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				  # Some publications wrap their JSON-LD data in their <script> tags 
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				  # in commented-out CDATA blocks, they need to be removed before 
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				  # attempting to parse JSON 
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				  # Matches a line that holds nothing but a commented-out CDATA marker,
				  # in either `/* ... */` or `// ...` comment style, opening or closing.
				  # Written in extended (`x`) mode: literal whitespace inside the pattern
				  # is ignored, so every escape must stay attached to its backslash
				  # (`\s`, `\*`, `\[`, `\]`) or the meaning of the pattern changes.
				  CDATA_JUNK_PATTERN = %r{^[\s]*(
				    (/\*[\s]*<!\[CDATA\[[\s]*\*/) # Block comment style opening
				    |
				    (//[\s]*<!\[CDATA\[) # Single-line comment style opening
				    |
				    (/\*[\s]*\]\]>[\s]*\*/) # Block comment style closing
				    |
				    (//[\s]*\]\]>) # Single-line comment style closing
				  )[\s]*$}x
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				  class  StructuredData 
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				    SUPPORTED_TYPES  =  %w( 
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				      NewsArticle 
 
			
		 
		
	
	
		
			
				
					
						
							
								 
							 
						
						
							
								 
							 
						
						
					 
				
			
			 
			 
			
				@ -61,6 +74,10 @@ class LinkDetailsExtractor
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				      publisher . dig ( 'logo' ,  'url' ) 
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				    end 
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				    # Reports whether this instance holds usable data: true unless `json`
				    # is blank (nil, empty, or whitespace-only per ActiveSupport's
				    # `blank?`). NOTE(review): `json` is defined outside this view.
				    def valid?
				      !json.blank?
				    end
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				    private 
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				    def  author 
 
			
		 
		
	
	
		
			
				
					
						
							
								 
							 
						
						
							
								 
							 
						
						
					 
				
			
			 
			 
			
				@ -134,11 +151,11 @@ class LinkDetailsExtractor
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				  end 
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				  # Title for the link preview, with HTML entities decoded.
				  #
				  # Sources are tried in order: the structured data headline, the
				  # `og:title` OpenGraph tag, then the content of the first `<title>`
				  # element in the document.
				  #
				  # The earlier undecoded form of this expression used to sit above the
				  # decoded one as a dead statement; only the decoded form is kept.
				  def title
				    html_entities.decode(
				      structured_data&.headline ||
				        opengraph_tag('og:title') ||
				        document.xpath('//title').map(&:content).first
				    )
				  end
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				  # Description for the link preview, with HTML entities decoded.
				  #
				  # Sources are tried in order: the structured data description, the
				  # `og:description` OpenGraph tag, then the `description` meta tag.
				  #
				  # The earlier undecoded form of this expression used to sit above the
				  # decoded one as a dead statement; only the decoded form is kept.
				  def description
				    html_entities.decode(
				      structured_data&.description ||
				        opengraph_tag('og:description') ||
				        meta_tag('description')
				    )
				  end
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				  def  image 
 
			
		 
		
	
	
		
			
				
					
						
						
						
							
								 
							 
						
					 
				
			
			 
			 
			
				@ -146,11 +163,11 @@ class LinkDetailsExtractor
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				  end 
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				  # Canonical URL of the page, as a string.
				  #
				  # The `canonical` link tag is preferred over the `og:url` OpenGraph
				  # tag. The chosen candidate is passed through `valid_url_or_nil` with
				  # `same_origin_only: true`; when that yields nil, the URL originally
				  # requested (`@original_url`) is returned instead.
				  #
				  # The superseded og:url-first lookup has been dropped, and the tag
				  # names are restored to 'canonical' and 'og:url' (they had stray
				  # whitespace inside the string literals, which would never match).
				  def canonical_url
				    candidate = link_tag('canonical') || opengraph_tag('og:url')

				    valid_url_or_nil(candidate, same_origin_only: true) || @original_url.to_s
				  end
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				  # Name of the publishing site, with HTML entities decoded.
				  #
				  # Uses the structured data publisher name when available, otherwise
				  # the `og:site_name` OpenGraph tag.
				  #
				  # The earlier undecoded form of this expression used to sit above the
				  # decoded one as a dead statement; only the decoded form is kept.
				  def provider_name
				    html_entities.decode(
				      structured_data&.publisher_name || opengraph_tag('og:site_name')
				    )
				  end
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				  def  provider_url 
 
			
		 
		
	
	
		
			
				
					
						
						
						
							
								 
							 
						
					 
				
			
			 
			 
			
				@ -158,7 +175,7 @@ class LinkDetailsExtractor
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				  end 
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				  # Name of the content's author, with HTML entities decoded.
				  #
				  # Sources are tried in order: the structured data author name, the
				  # `og:author` OpenGraph tag, then the `og:author:username` tag.
				  #
				  # The earlier undecoded form of this expression used to sit above the
				  # decoded one as a dead statement; only the decoded form is kept.
				  def author_name
				    html_entities.decode(
				      structured_data&.author_name ||
				        opengraph_tag('og:author') ||
				        opengraph_tag('og:author:username')
				    )
				  end
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				  def  author_url 
 
			
		 
		
	
	
		
			
				
					
						
							
								 
							 
						
						
							
								 
							 
						
						
					 
				
			
			 
			 
			
				@ -223,10 +240,24 @@ class LinkDetailsExtractor
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				  # Returns the first usable StructuredData built from the page's JSON-LD
				  # `<script type="application/ld+json">` elements, or nil when none of
				  # them qualifies. The result is memoized in `@structured_data`; because
				  # `||=` is used, a nil result is recomputed on the next call.
				  #
				  # The superseded implementation that only looked at the first script
				  # element (and its `rescue Oj::ParseError` / `nil` fallback) has been
				  # removed; it was left interleaved with the loop below.
				  def structured_data
				    @structured_data ||= begin
				      # Some publications have more than one JSON-LD definition on the page,
				      # and some of those definitions aren't valid JSON either, so we have
				      # to loop through here until we find something that is the right type
				      # and doesn't break
				      document.xpath('//script[@type="application/ld+json"]').filter_map do |element|
				        # Strip commented-out CDATA wrapper lines before parsing.
				        json_ld = element.content&.gsub(CDATA_JUNK_PATTERN, '')

				        next if json_ld.blank?

				        # Named `candidate` so the local does not shadow this method.
				        candidate = StructuredData.new(html_entities.decode(json_ld))

				        next unless candidate.valid?

				        candidate
				      rescue Oj::ParseError, EncodingError
				        # A broken definition must not prevent later ones from being tried.
				        Rails.logger.debug("Invalid JSON-LD in #{@original_url}")
				        next
				      end.first
				    end
				  end
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				
 
			
		 
		
	
	
		
			
				
					
						
						
						
							
								 
							 
						
					 
				
			
			 
			 
			
				@ -246,4 +277,8 @@ class LinkDetailsExtractor
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				      detector . strip_tags  =  true 
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				    end 
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				  end 
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				  # Lazily builds and memoizes the HTMLEntities instance used by this
				  # extractor for entity decoding, so one coder is reused across calls.
				  def html_entities
				    @html_entities ||= HTMLEntities.new
				  end
 
			
		 
		
	
		
			
				 
				 
			
			 
			 
			
				end