Crawl and parse the content of a web site

Contents

Output

Yahoo


Code examples


JavaScript

const request = require('request');
const cheerio = require('cheerio');

// Fetch the target page and print the text of its <title> element.
let target_url = "https://www.yahoo.com/";
request(
    // gzip: true asks `request` to accept and transparently decode gzip bodies.
    { method: 'GET', uri: target_url, gzip: true },
    function(err, res, content) {
        // On a transport failure `content` is undefined; report the error
        // instead of handing undefined markup to cheerio.
        if (err) {
            console.error(`Request to ${target_url} failed: ${err.message}`);
            return;
        }
        const $ = cheerio.load(content);
        console.log($('title').text());
    }
);

PHP

<?php
// Fetch the target page (advertising gzip/deflate support) and print the
// contents of its <title> element.
$target_url = "https://www.yahoo.com/";
$context = stream_context_create(array('http' => array(
    'method' => "GET",
    'header' => implode("\r\n", array('Accept-Encoding: gzip,deflate'))
)));
$content = file_get_contents($target_url, false, $context);
// file_get_contents() returns false on failure; stop before trying to
// decode or parse a non-string.
if ($content === false) {
    fwrite(STDERR, "Failed to fetch " . $target_url . "\n");
    exit(1);
}
// $http_response_header is populated by the HTTP stream wrapper with the
// response header lines of the request above.
if (isGzipResponse($http_response_header)) {
    $decoded = gzdecode($content);
    // gzdecode() returns false when the body is not valid gzip data.
    if ($decoded === false) {
        fwrite(STDERR, "Failed to decode gzip response from " . $target_url . "\n");
        exit(1);
    }
    $content = $decoded;
}
$matches = [];
// Non-greedy, case-insensitive, dot-matches-newline capture of the title text.
if( preg_match('!<title>(.*?)</title>!is', $content, $matches) ) {
    print $matches[1]."\n";
}

/**
 * Tell whether a list of HTTP response header lines declares a gzip body.
 *
 * @param array|null $headers Raw header lines, e.g. $http_response_header.
 * @return bool True if a Content-Encoding header mentions gzip, false otherwise
 *              (including when $headers is not an array).
 */
function isGzipResponse($headers) {
    // Guard against a missing header list (e.g. the request never completed).
    if (!is_array($headers)) {
        return false;
    }
    foreach($headers as $header) {
        // Match the header name at the start of the line, case-insensitively,
        // so an unrelated header that merely mentions the words cannot match.
        if (stripos($header, 'content-encoding:') === 0 && stripos($header, 'gzip') !== false) {
            return true;
        }
    }
    // Explicit boolean result instead of falling off the end and returning null.
    return false;
}

Perl

use strict;
use warnings;

use HTML::TagParser;

# Fetch the target page and print the text of its <title> element.
my $target_url = "https://www.yahoo.com/";
my $html = HTML::TagParser->new($target_url);

# In scalar context getElementsByTagName yields a single element (or nothing
# when the page has no matching tag), hence the ref check before use.
my $title = $html->getElementsByTagName( "title" );
print $title->innerText()."\n" if ref $title;

Python

from pyquery import PyQuery

# Download the target page and print the text of its <title> element.
target_url = "https://www.yahoo.com/"
document = PyQuery(url=target_url)
title_text = document('title').text()
print(title_text)

Ruby

require 'open-uri'
require 'nokogiri'

# Fetch the target page and print the content of every <title> element.
target_url = "https://www.yahoo.com/"
# Kernel#open no longer opens URLs (deprecated in Ruby 2.7, removed in 3.0);
# open-uri's URI.open is the supported entry point.
doc = Nokogiri::HTML(URI.open(target_url))
doc.search('title').each do |elm|
    puts elm.content
end