Aplikacje WWW

Marcin Benke

20 marca 2015

virtualenv

# Create an isolated Python environment in `dir`, activate it,
# and install the packages used in this lecture.
virtualenv dir
cd dir
source bin/activate
pip install beautifulsoup beautifulsoup4 django django-debug-toolbar

urllib

import urllib

response = urllib.urlopen('http://example.com')
pageText = response.read()
response.close()
print pageText

urlencode, quote

# urlencode builds an application/x-www-form-urlencoded query string;
# quote_plus percent-encodes reserved characters and maps spaces to '+'.
>>> urllib.urlencode((('k1','v1'),('k2','v2')))
'k1=v1&k2=v2'
>>> urllib.quote_plus('<ala ma kota&psa>')
'%3Cala+ma+kota%26psa%3E'

urllib2

Nagłówki, uwierzytelnianie, ciasteczka, ...

def urlopen(url):
    """Open *url* with our standard request headers.

    Sends a custom User-Agent, asks for gzip-compressed content and a
    non-persistent connection; returns the urllib2 response object.
    """
    standard_headers = (
        ('User-Agent', USER_AGENT),
        ('Accept-encoding', 'gzip'),
        ('Connection', 'close'),
    )
    req = urllib2.Request(url)
    for name, value in standard_headers:
        req.add_header(name, value)
    return urllib2.urlopen(req)

fetcher

    # Fetch `url` and return the page body, transparently un-gzipping it.
    # NOTE(review): fragment of a larger function -- `stoperFetch`, `warn`
    # and `HTTPForbidden` are defined elsewhere; confirm against the full file.
    try:
      stoperFetch.start()  # presumably a timing/stopwatch helper -- TODO confirm
      response = urlopen(url)
      # Server honoured our 'Accept-encoding: gzip' request: decompress in memory.
      if response.headers.get('content-encoding', ) == 'gzip':
          compressed = response.read()
          pageText = gzip.GzipFile(fileobj=StringIO(compressed)).read()
          response.close()
      else:
          pageText = response.read()
          response.close()
      return pageText
    except urllib2.HTTPError, e:  # Python 2 except syntax
        if e.getcode() == 503:
            # Service unavailable: log the details, then escalate.
            warn('HTTPError: 503')
            warn(unicode(e.info()))
            print e.read()
            raise HTTPForbidden
        else:
            # Other HTTP errors are only logged; control falls through,
            # implicitly returning None.
            warn('HTTPError')

Ćwiczenia

Beautifulsoup

Beautiful Soup: We called him Tortoise because he taught us.

“You didn't write that awful page. You're just trying to get some data out of it. Beautiful Soup is here to help. Since 2004, it's been saving programmers hours or days of work on quick-turnaround screen scraping projects.”

Alice

# Sample document used throughout the BeautifulSoup examples below.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were
three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup
# NOTE(review): no parser specified -- bs4 picks the "best" installed one
# and warns; BeautifulSoup(html, 'html.parser') would make it explicit.
soup = BeautifulSoup(html)

Nawigacja

# Dotted access returns the first tag of that name in the document.
soup.title
# <title>The Dormouse's story</title>

soup.title.name
# u'title'

soup.title.string
# u'The Dormouse's story'

soup.title.parent.name
# u'head'

soup.p
# <p class="title"><b>The Dormouse's story</b></p>

soup.p['class']
# ['title']
# NOTE(review): in bs4, `class` is a multi-valued attribute, so this
# yields a list -- not u'title' as in BeautifulSoup 3; verify which
# library version the slide targets.

soup.a
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

Wyszukiwanie

# find_all collects every matching tag; find(...) returns just the first.
soup.find_all('a')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

soup.find(id="link3")
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
# NOTE(review): the two calls below repeat the ones above verbatim --
# probably a slide-extraction artifact.
soup.find_all('a')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

soup.find(id="link3")
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

Jeszcze przykłady

[a.find(class_="link1" for a in soup.find_all(class_='col5')]
  RE_PA = re.compile('^pa\d+')
  soup = BeautifulSoup(html)
  nodes = soup.findAll('a',id=RE_PA)
  for node in contents:
    if isinstance(node,NavigableString):
      ad['lines'].append(node)
    elif isinstance(node,Tag):
      if node.name == 'span':
        ad['displayurl']=concatText(node.contents)

Ćwiczenia