Scraping a dynamic page with cookies

https://stackoverflow.com/questions/21359310

02-10-2022
|

Question

I am trying to scrape this page for a set of zipcodes. https://www.chase.com/mortgage/loan-officer/search-results.html#action-search;zipcode-11747;lastname-;language-

If you put that URL in your browser, you will get results; however, trying to do the same in code fails.

First I tried

// Request the page and read the whole response body as trimmed text.
HttpWebRequest request = (HttpWebRequest)System.Net.WebRequest.Create(URI);
string page;
// The response has to be obtained from the request before its stream can be read;
// the using blocks release the connection and the reader once the body has been consumed.
using (var resp = (HttpWebResponse)request.GetResponse())
using (var sr = new System.IO.StreamReader(resp.GetResponseStream()))
{
    page = sr.ReadToEnd().Trim();
}

but the following code, generated by a Fiddler plugin, didn't work either: no results are returned. So what exactly am I missing?

/// <summary>
/// Sends the chase.com request and, when a response is obtained, reads its body as text.
/// </summary>
private void MakeRequests()
{
    HttpWebResponse response;

    if (Request_www_chase_com(out response))
    {
        // Dispose the response even when reading the body throws, so the
        // underlying connection is always released.
        using (response)
        {
            // The body text is read here; nothing in this method consumes it yet.
            string responseText = ReadResponse(response);
        }
    }
}

/// <summary>
/// Reads the entire body of <paramref name="response"/> as UTF-8 text, decompressing it
/// first when the Content-Encoding header mentions gzip or deflate.
/// </summary>
/// <param name="response">The HTTP response whose body is read.</param>
/// <returns>The decoded response body.</returns>
private static string ReadResponse(HttpWebResponse response)
{
    using (Stream responseStream = response.GetResponseStream())
    {
        // Match the encoding token ordinally and case-insensitively. Culture-sensitive
        // lower-casing (e.g. the Turkish dotless i) can turn "GZIP" into a string that
        // no longer contains "gzip", which would silently skip decompression.
        string contentEncoding = response.ContentEncoding ?? string.Empty;

        Stream streamToRead = responseStream;
        if (contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            streamToRead = new GZipStream(streamToRead, CompressionMode.Decompress);
        }
        else if (contentEncoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            streamToRead = new DeflateStream(streamToRead, CompressionMode.Decompress);
        }

        using (StreamReader streamReader = new StreamReader(streamToRead, Encoding.UTF8))
        {
            return streamReader.ReadToEnd();
        }
    }
}

/// <summary>
/// Issues a GET request for the chase.com loan-officer search page using a fixed set of
/// browser-like headers and cookies.
/// </summary>
/// <param name="response">
/// Receives the server's response. This is also set for HTTP protocol errors, in which case
/// it is the error response carried by the <see cref="WebException"/>. It is null when the
/// method returns false.
/// </param>
/// <returns>
/// True when a response (successful or protocol-error) was obtained; false for any other failure.
/// </returns>
private bool Request_www_chase_com(out HttpWebResponse response)
{
    response = null;

    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create("https://www.chase.com/mortgage/loan-officer/search-results.html");

        request.KeepAlive = true;
        request.Headers.Set(HttpRequestHeader.CacheControl, "max-age=0");
        request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
        request.UserAgent = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.76 Safari/537.36";
        request.Headers.Add("DNT", @"1");
        request.Referer = "https://mail.google.com/mail/u/0/?shva=1";
        request.Headers.Set(HttpRequestHeader.AcceptEncoding, "gzip,deflate,sdch");
        request.Headers.Set(HttpRequestHeader.AcceptLanguage, "en-US,en;q=0.8");
        request.Headers.Set(HttpRequestHeader.Cookie, @"v1st=3B46E5CCD302C2DE; marketlist=68|90|152|170|198; chasezip=zipcode=11577&county=Nassau&state=NY; ASP.NET_SessionId=kwybehscfioasswbl20wb14f; PC_1_0=n%3Dundefined|u%3Dundefined|l%3Dundefined|zip%3D11577|lastUpdate%3D2014-01-24|lastSent%3D2014-01-24|home%3Dpersonal|; SessionPersistence=CLICKSTREAMCLOUD%3A%3DvisitorId%3D%7CPROFILEDATA%3A%3D%7CSURFERINFO%3A%3Dbrowser%3DChrome%2COS%3DWindows%2Cresolution%3D1366x768%7C; fsr.s=%7B%22v2%22%3A-2%2C%22v1%22%3A1%2C%22rid%22%3A%22d464cf6-82273859-c860-572f-2944b%22%2C%22to%22%3A5%2C%22c%22%3A%22https%3A%2F%2Fwww.chase.com%2Fmortgage%2Floan-officer%2Fsearch-results.html%23action-search%3Bzipcode-11747%3Blastname-%3Blanguage-%22%2C%22pv%22%3A12%2C%22lc%22%3A%7B%22d18%22%3A%7B%22v%22%3A12%2C%22s%22%3Atrue%7D%7D%2C%22cd%22%3A18%2C%22sd%22%3A18%2C%22f%22%3A1390649574789%7D");
        // The date literal uses English day/month names, so parse it with the invariant
        // culture; under a non-English current culture the parse would throw and the
        // generic catch below would silently turn that into a false return.
        // NOTE(review): If-Modified-Since may make the server answer 304 Not Modified with
        // no body - confirm this header is actually wanted.
        request.IfModifiedSince = DateTime.Parse("Fri, 24 Jan 2014 20:18:51 GMT", System.Globalization.CultureInfo.InvariantCulture);

        response = (HttpWebResponse)request.GetResponse();
    }
    catch (WebException e)
    {
        // A protocol error still carries a response from the server; hand it to the caller.
        if (e.Status == WebExceptionStatus.ProtocolError)
        {
            response = (HttpWebResponse)e.Response;
        }
        else
        {
            return false;
        }
    }
    catch (Exception)
    {
        if (response != null)
        {
            response.Close();
        }

        return false;
    }

    return true;
}

Solution

To make this work, you would need to parse the HTML, then download and run the page's JavaScript. Instead of writing your own browser, use a WebBrowser control to load the page, then scrape its inner HTML once the scripts have run.

OTHER TIPS

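The page uses AJAX to create the results, so all you will see in your response is the initial HTML. The search parameters sit in the URL fragment (the part after `#`), which the browser never sends to the server; the page's JavaScript reads it and requests the results separately.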

Licensed under: CC-BY-SA with attribution

Not affiliated with StackOverflow