Python 4chan downloader

December 14, 2014 by Asbra — 2 Comments

A 4chan downloader/leecher I wrote while learning Python.
If you just execute it, it scrapes the catalog page of the specified section, then scrapes every thread and downloads all images from them.

The catalog function returns a list of all threads on the specified board.
The thread function returns a list of all images in the thread with the specified id.

Python is quite a nice language, with a very simple syntax and a lot of libraries to use. I started learning Python for a new project I’m working on, mostly to get a decent solution for multi-threaded scraping/botting.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @author:      johan
# @date:        2014-12-12
# @modified_by: johan
# @modified_at: 2014-12-12

import re        # Regular expressions
import requests  # To make HTTP requests
import json      # To parse 4chan's JSON
import shutil    # Used when downloading file
import os        # For creating folders

class fchan(object):
	"""Minimal 4chan scraper: lists a board's threads and a thread's images.

	The methods keep no state. They print progress messages to stdout and
	report a failed or unparseable page by returning an empty list.

	Every print() call is given a single string, which keeps the code valid
	on both Python 2 and Python 3.
	"""

	# Text that immediately precedes the catalog JSON object in the page source
	_CATALOG_MARKER = 'var catalog = '

	# Links to full-size files. Dots are escaped so they only match a literal
	# '.', board names may contain digits (r9k, s4s, 3), and the optional
	# prefix (e.g. 'https:') is not allowed to run past the attribute's quote.
	_IMAGE_RE = re.compile(
		r'href="[^"]*?(//i\.4cdn\.org/[a-z0-9]+/[0-9]+\.(?:jpg|jpeg|png|gif|webm))"')

	def catalog(self, section):
		"""Return the ids of all threads listed in a board's catalog.

		section -- board name without slashes, e.g. 'g'

		Returns a list of thread id strings. Returns [] when the catalog page
		cannot be downloaded or its embedded JSON cannot be parsed.
		"""
		print('Grabbing catalog for /'+section+'/ ..')

		# Build Url
		url = 'https://boards.4chan.org/'+section+'/catalog'

		# Download page
		r = requests.get(url)

		# Error handling
		if r.status_code != 200 or not r.content:
			print('Failed to read catalog. Are you sure that there is a /'+section+'/ section?')
			return []

		# r.text (decoded) rather than r.content (bytes on Python 3) so the
		# string handling below behaves the same on both Python versions
		threads = self._parse_catalog(r.text)

		if threads is None:
			print('Failed to parse catalog for /'+section+'/')
			return []

		print('Found '+str(len(threads))+' threads')

		return threads

	def thread(self, section, id):
		"""Return the full-size image URLs linked from one thread.

		section -- board name without slashes, e.g. 'g'
		id      -- thread id; a string or an int (the parameter name shadows
		           the builtin but is kept so keyword callers keep working)

		Returns a list of protocol-relative URLs ('//i.4cdn.org/...') without
		duplicates, in page order. Returns [] when the page cannot be
		downloaded.
		"""
		# Accept numeric ids as well as the strings returned by catalog()
		path = '/'+section+'/thread/'+str(id)+'/'

		print('Grabbing thread '+path)

		# Download page
		r = requests.get('https://boards.4chan.org'+path)

		# Error handling
		if r.status_code != 200 or not r.content:
			print('Failed to read thread '+path)
			return []

		# De-duplicate before counting: a thread page may link the same file
		# more than once, which would inflate the reported number
		images = self._parse_thread(r.text)

		print('Found '+str(len(images))+' images in thread '+path)

		return images

	def _parse_catalog(self, html):
		"""Extract thread ids from catalog page source; None if unparseable."""
		start = html.find(self._CATALOG_MARKER)
		if start < 0:
			return None

		payload = html[start + len(self._CATALOG_MARKER):].lstrip()

		# raw_decode reads exactly one JSON value and ignores whatever script
		# follows it, so there is no need to guess where the object ends
		try:
			data, _ = json.JSONDecoder().raw_decode(payload)
		except ValueError:
			return None

		if not isinstance(data, dict):
			return None

		# An empty board may serialize 'threads' as an empty list or omit it
		threads = data.get('threads') or {}
		if not isinstance(threads, dict):
			return None

		return list(threads)

	def _parse_thread(self, html):
		"""Return the unique image URLs found in thread page source."""
		return self.uniq(self._IMAGE_RE.findall(html))

	# Remove duplicate elements in list
	def uniq(self, seq):
		"""Return the items of seq without duplicates, keeping first-seen order.

		Items must be hashable.
		"""
		seen = set()
		result = []
		for item in seq:
			if item not in seen:
				seen.add(item)
				result.append(item)
		return result

# Script entry point: download every image of every thread on one board into
# ./<section>/<thread id>/. Usage: 4chan.py <section>
if __name__ == '__main__':
	import sys

	# Input validation. Surrounding slashes are dropped so '/g/' works like 'g'.
	section = sys.argv[1].strip('/') if len(sys.argv) > 1 else ''

	if not section:
		print('4chan.py <section>')
		print('No section given')
		# Non-zero status so calling shells and scripts can detect the error
		sys.exit(1)

	chan = fchan()

	# Get all threads from the catalog
	threads = chan.catalog(section)

	# Create section folder if it doesn't exist
	if not os.path.exists(section):
		os.makedirs(section)

	# Local file name = last URL component: digits, a literal dot, extension
	filename_re = re.compile(r'/([0-9]+\.(?:jpg|jpeg|png|gif|webm))$')

	# Iterate all threads and download images
	for thread in threads:
		# Get images
		images = chan.thread(section, thread)

		# Create image folder if it doesn't exist
		folder = section+'/'+thread
		if not os.path.exists(folder):
			os.makedirs(folder)

		# Iterate images list and download them
		for image in images:
			match = filename_re.search(image)

			# Skip anything that does not look like an image URL instead of
			# failing on an empty match
			if not match:
				continue

			filename = folder+'/'+match.group(1)

			print('Downloading /'+filename)

			# Download image
			q = requests.get('https:'+image, stream=True)

			try:
				# Do not save an error page under an image file name
				if q.status_code != 200:
					print('Failed to download /'+filename)
					continue

				with open(filename, 'wb') as f:
					# Have urllib3 undo any gzip/deflate transfer encoding
					q.raw.decode_content = True
					shutil.copyfileobj(q.raw, f)
			finally:
				# A streamed response holds its connection until closed
				q.close()

Github Gist (4chan downloader)

Asbra

Posts Facebook

Blogging out of many years of experience with gamehacking, programming, reverse-engineering and general tomfoolery.

2 responses to Python 4chan downloader

  1. Hey! is there any way to contact you? I really love what you do, i study computer engineering and would like to know more about reverse 😛 Maybe you can handle me some useful tutorials

Leave a Reply