#!/usr/bin/python

# Author
# Krhis (http://krhis.net)
#
# License
# GNU General Public License version 3 only
#
# Usage
# The following will compile a list of images tagged with 'maid':
# $ python gelbooru.py maid
#
# What if you're only interested in maids eating ice cream?
# $ python gelbooru.py ice_cream maid
#
# Description
# Ok, first of all the board admins out there are going to hate me for this.
# With that out of the way; the purpose of this script is to spider
# gelbooru.com and collect a list of working URLs linking directly to its
# content (images and so on) by indicating the tag or tags you have interest
# in. I was originally planning on including a feature to just download the
# images to a directory to begin with, but throttling was a major concern so
# instead it generates a neat .lst file which should be acceptable. Expect this
# script to break if the board admin drastically changes the layout.
#
# This script was working as of January 26, 2010.
#
# So it generated a .lst file, now what? Use wget like so:
# $ wget --user-agent="Opera/9.20 (Windows NT 6.0; U; en)" --limit-rate=10k \
# --wait=5 --random-wait -c -i tag.lst
#
# Please don't abuse this: throttle your stuff and let it run overnight.

import sys
import random
import urllib2
#import time

if len(sys.argv) < 2:
    print "Usage: ./%s <tag(s)>" % sys.argv[0]
    print "Example: ./%s ice_cream maid" % sys.argv[0]
    sys.exit(1)

# A collection of common user agents spanning a variety of operating systems,
# browsers, and versions. Anything is better then 'Python-urllib/2.6'.
cua = ['Mozilla/4.0 (compatible; MSIE 6.0; MSIE 5.5; Windows NT 5.1) Opera 7.02 [en]',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 8.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 8.50',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.5) Gecko/20060127 Netscape/8.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.1.3) Gecko/20090824 Firefox/3.5.3',
'Mozilla/5.0 (X11; U; FreeBSD i386; en-US; rv:1.9a2) Gecko/20080530 Firefox/3.0a2',
'Mozilla/5.0 (X11; U; Linux i686; de-AT; rv:1.7.2) Gecko/20040810 Debian/1.7.2-2',
'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.0.11) Gecko/2009060309 Ubuntu/9.04 (jaunty) Firefox/3.0.11',
'Opera/9.00 (Windows NT 5.1; U; en)',
'Opera/9.20 (Windows NT 6.0; U; en)',
'Opera/9.64 (Windows NT 6.0; U; es-ES) Presto/2.1.1',
'Opera/9.7 (Windows Mobile; PPC; Opera Mobi/35166; U; en) Presto/2.2.1',
'Opera/9.80 (Windows NT 5.1; U; en) Presto/2.2.15 Version/10.00']
ua = cua[random.randint(0,len(cua)-1)]

print "- Selected user agent %s" % ua

page = 0
pagelist = []
imglist = []
done = False

tags = sys.argv
del tags[0]
tags = sorted(tags)
alltags = ""
print "- Searching for images tagged with",
for tag in tags:
    if tags[len(tags)-1] != tag:
        print tag+",",
    else:
        print tag,
    if tags[len(tags)-1] == tag:
        alltags += tag
    elif tags[len(tags)-1] != tag:
        alltags += tag+"+"

while done == False:
    f = None
    attempt = 0
    while f == None:
        #time.sleep(random.randint(10,30)) # Sleep 10-30 seconds
        req = urllib2.Request('http://gelbooru.com/index.php?page=post&s=list&tags=%s&pid=%s' % (alltags,page*25))
        req.add_header('User-agent',ua)
        try:
            f = urllib2.urlopen(req)
        except:
            attempt += 1
            sys.stdout.write('!')
            sys.stdout.flush()
        else:
            sys.stdout.write('.')
            sys.stdout.flush()
        if attempt >= 5:
            print "\n- Failed, gave up after 5 attempts"
            sys.exit(1)
    html = f.read()
    i = 0
    for part in html.split('"'):
        if part.find('index.php?page=post&amp;s=view&amp;id=') != -1:
            i += 1
            pagelist.append(part[38:])
    page += 1
    if i == 0:
        done = True
pagelist = set(pagelist)
print "\n- Found %s pages, containing %s posts" % (page,len(pagelist)),

for imgpage in pagelist:
    f = None
    attempt = 0
    while f == None:
        #time.sleep(random.randint(10,30)) # Sleep 10-30 seconds
        req = urllib2.Request('http://gelbooru.com/index.php?page=post&s=view&id=%s' % imgpage)
        req.add_header('User-agent',ua)
        try:
            f = urllib2.urlopen(req)
        except:
            attempt += 1
            sys.stdout.write('!')
            sys.stdout.flush()
        else:
            sys.stdout.write('.')
            sys.stdout.flush()
        if attempt >= 5:
            print "\n- Failed, gave up after 5 attempts"
            sys.exit(1)
    html = f.readlines()
    for line in html:
        if line.find('Original image</a></li>') != -1:
            for part in line.split('"'):
                if part.find('gelbooru.com//images') != -1:
                    imglist.append(part)

if len(imglist) > 0:
    imglist = set(imglist)
    f = open(("%s.lst" % alltags), "w")
    for imgloc in imglist:
        f.write(imgloc+"\n")
    f.close()
    print "\n- Saved %s urls to %s.lst" % (len(imglist),alltags)
else:
    print "\n- Failed!"
    sys.exit(1)

print "- Done!"
sys.exit(0)
