Question

BTW... This website only works with Internet Explorer....

I am attempting to scrape a website for a client so that I can automate a task for them. Basically, it will scrape different reports, look for the turnaround times, and email them to the client. My scraping program works fine; the problem I am having is logging into the website with Mechanize, since the login form uses AJAX. I have looked around for a solution but can't seem to find exactly what I'm looking for.

Below is the HTML form and (from what I can tell) the AJAX that handles it.

/**
 * Send the login form to the server asynchronously.
 *
 * The fields of Form1 are serialized and POSTed to login.aspx with the
 * isAjax/eventTarget query flags. TransLoginFinished is registered as the
 * completion callback; reportError and reportException handle failures.
 * (Form.serialize / Ajax.Request look like Prototype.js -- TODO confirm.)
 */
function TranLogin() {
    var requestOptions = {
        method      : 'post',
        postBody    : Form.serialize('Form1'),
        onComplete  : TransLoginFinished,
        onFailure   : reportError,
        onException : reportException
    };
    // ajaxRequest is assigned without `var`, exactly as before, so it is not
    // local to this function.
    ajaxRequest = new Ajax.Request('login.aspx?isAjax=true&eventTarget=TranLogin', requestOptions);
}

/**
 * Completion callback for the login request (registered as onComplete in
 * TranLogin).
 *
 * Reads the XML response and branches on the text of its <LoginResult>
 * element:
 *   '-1' : alert the <FailMsg> text, re-enable Form1 and stop.
 *   '20' : redirect to initpasswd.aspx.
 *   '14' : redirect to chgpasswd.aspx?type=chgpwd.
 *   '16' : redirect to chgpasswd.aspx?type=pwdexpire.
 *   '0'  : optionally offer a password change, then select an account
 *          (hospital) code via TranSelectHosp, showing a pick list when
 *          txtHospCode is empty and the server returned several entries.
 * Any other value falls through without action.
 *
 * Node values are read through the `.text` property and link labels through
 * `innerText`.
 *
 * @param serverResponse response object; only its responseXML is used.
 */
function TransLoginFinished(serverResponse) {
    // requestFailed is set outside this function; nothing is processed when it is truthy.
    if (requestFailed) return;
    // xmlNodes, usrSite and usrCode are assigned without `var`, so they are
    // not local to this function.
    xmlNodes = serverResponse.responseXML;
    usrSite = "8000";
    usrCode = decodeXmlChar(xmlNodes.getElementsByTagName('UserCode')[0].text);
    // '-1': show the server-supplied failure message and let the user retry.
    if (xmlNodes.getElementsByTagName('LoginResult')[0].text == '-1'){
        alert(decodeXmlChar(xmlNodes.getElementsByTagName('FailMsg')[0].text));
        Form.enable('Form1');
        return;
    }
    // '20': go to the initial-password page.
    if (xmlNodes.getElementsByTagName('LoginResult')[0].text == '20'){
        window.location.replace('initpasswd.aspx?usersite=' + usrSite + '&usercode=' + usrCode);
        return;
    }
    // '14': go to the change-password page (type=chgpwd).
    if (xmlNodes.getElementsByTagName('LoginResult')[0].text == '14'){
        window.location.replace('chgpasswd.aspx?type=chgpwd&usersite=' + usrSite + '&usercode=' + usrCode);
        return;
    }
    // '16': go to the change-password page (type=pwdexpire).
    if (xmlNodes.getElementsByTagName('LoginResult')[0].text == '16'){
        window.location.replace('chgpasswd.aspx?type=pwdexpire&usersite=' + usrSite + '&usercode=' + usrCode);
        return;
    }
    // '0': continue to account selection.
    if (xmlNodes.getElementsByTagName('LoginResult')[0].text == '0'){
        // When the response carries a password-expiry warning, ask whether to
        // change the password now; declining continues below.
        if (xmlNodes.getElementsByTagName('PwdExpireWarning')[0].text == 'true'){
            var changePwdNow = window.confirm(decodeXmlChar(xmlNodes.getElementsByTagName('PwdExpireMsg')[0].text));
            if (changePwdNow == true){
                window.location.replace('chgpasswd.aspx?type=chgpwd&usersite=' + usrSite + '&usercode=' + usrCode);
                return;
            }
            // Disabled alternative that prompted through a modal dialog
            // instead of window.confirm:
            // var arg = { promptMsg :decodeXmlChar(xmlNodes.getElementsByTagName('PwdExpireMsg')[0].text),
            //             buttons : [ { value : "Yes", rtnVal : 1 },
            //                         { value : "No", rtnVal : 0 }
            //                       ]
            //           };
            // var rtn = window.showModalDialog('../Modules/ModalMessageBox.aspx',arg, "dialogHeight:140px;dialogWidth:500px; center:1;status:no;");
            // if (rtn && rtn == 1){
            //     window.location.replace('chgpasswd.aspx?type=chgpwd&usersite=' + usrSite + '&usercode=' + usrCode);
            //     return;
            // }
        }
        // No account code typed in: take it from the '|'-separated <HospList>.
        if (JTrim($('txtHospCode').value) == '') {
            hospList = decodeXmlChar(xmlNodes.getElementsByTagName('HospList')[0].text).split('|');
            // Fewer than two entries: use the part before the first '-' of the
            // only entry and select it directly.
            if (hospList.length < 2) {
                selectedHospCode = hospList[0].split('-')[0];
                TranSelectHosp(selectedHospCode);
                return;
            }
            $('divHospList').style.display = 'block';

            // Build one clickable link per non-empty entry inside divHospListBG.
            for(i=0;i<hospList.length;i++)
            {
                if (hospList[i] != '')
                {
                    divHospCode = document.createElement("div");
                    divHospCode.className='divHospCode';
                    $('divHospListBG').appendChild(divHospCode);
                    lnkHospCode = document.createElement("a");
                    // Entries longer than 33 characters are shown as their first
                    // 30 characters plus '...'; the full text goes in the title.
                    if (hospList[i].length <= 33)
                        lnkHospCode.innerText = hospList[i];
                    else
                        lnkHospCode.innerText = hospList[i].substr(0,30) + '...';
                    lnkHospCode.title = hospList[i];
                    lnkHospCode.className = 'lnkHospCode';
                    divHospCode.appendChild(lnkHospCode);
                    lnkHospCode.onmouseover = function(){this.style.color = '#000000';}
                    lnkHospCode.onmouseout = function(){this.style.color = '#6c6c6c';}
                    // Clicking selects the part of the displayed text before the first '-'.
                    lnkHospCode.onclick = function(){TranSelectHosp(this.innerText.split('-')[0]);}
                    // Past index 7, fix the list height and make it scrollable
                    // (applied once; the overflow check skips later iterations).
                    if (i > 7 && $('divHospListBG').style.overflow != 'auto')
                    {
                        $('divHospListBG').style.height = '198px';
                        $('divHospListBG').style.overflow = 'auto';
                    }
                }
            }
            return;
        }
        else
        {
            // An account code was typed in: use it as-is (trimmed).
            TranSelectHosp(JTrim($('txtHospCode').value));
        }
    }
}

<form name="Form1" method="post" action="login.aspx" id="Form1">
Input Account Code:
            <div class="divRight">
                <input name="txtHospCode" type="text" id="txtHospCode" class="inputClass" maxlength="4" />
            </div>
            <div class="divLeft">
                <span>Input User Code:</span>
            </div>
            <div class="divRight">
                <input name="txtUserCode" type="text" id="txtUserCode" class="inputClass" maxlength="6" />
            </div>
            <div class="divLeft">
                <span>Input Password:</span></div>
            <div class="divRight">
                <input name="txtPassword" type="password" id="txtPassword" class="inputClass" />
            </div>
            <div class="divLeft">
                <span>Login As:</span>
            </div>
            <div class="divRight">
                &nbsp;<input type="radio" name="rdLoginType" value="D" checked="checked" />Doctor&nbsp;&nbsp;
                <input type="radio" name="rdLoginType" value="T" />Other
            </div>
            <div class="divLeft">
            </div>
            <div class="divRight">
                <input class="buttonClass" id="btnOK" type="button" value="Enter" onclick="LoginIn();" />
                <input class="buttonClass" id="btnReset" type="button" value="Reset" onclick="ResetInput();" />
            </div>

My code so far


import mechanize
import cookielib
from BeautifulSoup import BeautifulSoup
import html2text
import re

# Module-level mechanize browser shared by the rest of the script.
# The section headers below are comments, and every statement sits on its own
# line, so the block is valid Python.

# Instantiate Browser
br = mechanize.Browser()

# Cookie Jar
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)

# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)

# Follow refresh 0 but do not hang on refresh > 0
# (HTTPRefreshProcessor is taken from mechanize's private _http module).
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

# User-Agent
br.addheaders = [('User-agent', 'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 6.0)')]

def login_to_website(login_url, login_form_name, usr_form_name, pwd_form_name,
                     acct_code_name, usr, pwd, acct_code):
    """Log the user into the website using the module-level browser ``br``.

    Opens ``login_url``, selects the form named ``login_form_name``, fills in
    the account code, user code and password controls, sets the
    ``rdLoginType`` radio control to ``'T'``, submits the form and prints
    whether the browser ended up on a different URL.

    Args:
        login_url: URL of the login page; also compared with the URL reached
            after submitting to decide whether the login failed.
        login_form_name: Name of the login form to select.
        usr_form_name: Name of the user-code control.
        pwd_form_name: Name of the password control.
        acct_code_name: Name of the account-code control.
        usr: User code to enter.
        pwd: Password to enter.
        acct_code: Account code to enter.

    Returns:
        None. Progress and the outcome are printed to standard output.
    """
    # Open the url of the login page
    br.open(login_url)

    # Select the login form name
    br.select_form(login_form_name)

    # Enter user's credentials into the form
    br.form[acct_code_name] = acct_code
    br.form[usr_form_name] = usr
    br.form[pwd_form_name] = pwd
    br.find_control(name='rdLoginType').value = ['T']

    # Submit the form
    # NOTE(review): this is a plain form submit. The page's own script posts
    # the serialized form to 'login.aspx?isAjax=true&eventTarget=TranLogin'
    # instead, so the server may answer differently -- confirm.
    # The %-formatted single-argument print gives the same output on
    # Python 2 and Python 3.
    print("Logging in as: %s" % (usr,))
    br.submit()

    # print current url
    current_url = br.geturl()
    print("We are now at: %s" % (current_url,))

    # print error: still being on the login URL is treated as a failed login
    if current_url == login_url:
        print("Login Failed")
    else:
        print("Successfully logged in")

# Run the login with the form and control names used by the page.
# NOTE: usr, pwd and acctCode are not defined anywhere in this snippet; they
# must be set before this call runs.
login_to_website(
    'https://www.website.com',
    'Form1',
    'txtUserCode',
    'txtPassword',
    'txtHospCode',
    usr,
    pwd,
    acctCode
)

Was it helpful?

Solution

Mechanize doesn't, as far as I know, process JavaScript. So your options are, in rough order of what I'd try:

  • Turn JavaScript off in your browser, and see if you can still log into the site. If so, attempt to use mechanize with that process.
  • Try to work out what the effect of the AJAX form is (both server side and client side), and attempt to emulate it using Python. If you've not already found it, something like Firebug is invaluable for this.
  • Use one of the various libraries allowing Python to take control of a real browser. I've never done this, but I know there are wrappers for at least Firefox and IE.
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top