import os, urllib.request, sys, re, collections, multiprocessing as mp
# A kind of parrot image: the regex group that identifies it and the label
# used when printing its count.
ParrotType = collections.namedtuple("ParrotType", "groupName description")

# What a worker reports for one page; ``counts`` is None when the page
# yielded no counts.
Result = collections.namedtuple("Result", "pageNumber counts")

# Number of worker processes, also used as the page look-ahead distance.
ProcessPoolSize = 5

# Matches a parrot smiley <img> reference; exactly one of the named groups
# ("normal" or "hat") participates in any given match.
ParrotRegex = re.compile(
    r'img src="/forums/smiles/('
    r'(?P<normal>bird\.gif)|'
    r'(?P<hat>wally_the_prestigious_monocled_bird\.gif)'
    r')"'
)

# Order matters: the position of each entry is its index into every
# ``counts`` list.
Types = [
    ParrotType(groupName="hat", description="Parrot with hat"),
    ParrotType(groupName="normal", description="Normal parrot"),
]
def countInString(ParrotRegex, Types, counts, string):
    """Tally every regex match in ``string`` into ``counts``, in place.

    Args:
        ParrotRegex: Compiled pattern with one named group per parrot type.
        Types: Sequence of records with a ``groupName`` attribute; the
            position of a record is its index in ``counts``.
        counts: Mutable sequence of integers, one slot per entry of
            ``Types``; incremented in place.
        string: Text to scan.

    A match is credited to the first type whose named group participated
    in it. A match in which none of the listed groups participated is
    skipped instead of raising ``StopIteration``.
    """
    for match in ParrotRegex.finditer(string):
        for index, parrotType in enumerate(Types):
            if match.group(parrotType.groupName) is not None:
                counts[index] += 1
                break  # credit each match to at most one type
def printResults(counts):
    """Print one line per parrot type, then the grand total.

    ``counts`` is indexed in the same order as the module-level ``Types``
    list.
    """
    for index, kind in enumerate(Types):
        line = "{0}: {1}".format(kind.description, counts[index])
        print(line)
    total = sum(counts)
    print("Total: " + str(total))
def addCounts(counts1, counts2):
    """Return a new list holding the element-wise sum of two count lists.

    Pairs are formed with ``zip``, so the result is as long as the shorter
    input. Neither argument is modified.
    """
    summed = []
    for left, right in zip(counts1, counts2):
        summed.append(left + right)
    return summed
def countOnWebPage(ParrotRegex, Types, pageUrl):
    """Download ``pageUrl`` and count the parrots on it.

    Returns:
        A list of counts, one per entry of ``Types``, or ``None`` when the
        URL of the retrieved response is not exactly ``pageUrl``.
    """
    with urllib.request.urlopen(pageUrl) as response:
        # A different final URL means we did not get the page we asked for
        # (presumably a redirect for a page past the end -- confirm against
        # the target site).
        if response.url == pageUrl:
            tally = [0 for _ in Types]
            html = response.read().decode("utf-8")
            countInString(ParrotRegex, Types, tally, html)
            return tally
        return None
def countIntoQueue(ParrotRegex, Types, pageNumbersQueue, resultsQueue,
                   threadUrl):
    """Worker loop: turn page numbers into ``Result`` records.

    Page numbers are taken from ``pageNumbersQueue`` until a ``None``
    sentinel arrives. For each page the URL is built from ``threadUrl``,
    the page is counted, and a ``Result`` is put on ``resultsQueue``.

    Once a page yields ``None`` counts, this worker stops fetching: later
    page numbers are still taken off the queue and acknowledged, but no
    result is produced for them. ``task_done()`` is called for every item
    taken, including the sentinel.
    """
    stillCounting = True
    for requestedPageNo in iter(pageNumbersQueue.get, None):
        if stillCounting:
            # Page 1 lives at the bare thread URL; others at ".../page-N".
            if requestedPageNo == 1:
                pageUrl = threadUrl + "/"
            else:
                pageUrl = "{0}/page-{1}".format(threadUrl, requestedPageNo)
            pageCounts = countOnWebPage(ParrotRegex, Types, pageUrl)
            stillCounting = pageCounts is not None
            resultsQueue.put(Result(requestedPageNo, pageCounts))
        pageNumbersQueue.task_done()
    # Acknowledge the None sentinel that ended the loop.
    pageNumbersQueue.task_done()
def poisonQueue(queue):
    """Put one ``None`` sentinel on ``queue`` per worker process.

    The number of sentinels is the module-level ``ProcessPoolSize``.
    """
    for sentinel in [None] * ProcessPoolSize:
        queue.put(sentinel)
# Thread URL from the first command-line argument, with leading and
# trailing slashes stripped.
threadUrl = sys.argv[1].strip("/")

# Running totals, one slot per entry of Types.
counts = [0] * len(Types)

# Page numbers flow to the workers; Result records flow back.
pageNumbersQueue = mp.JoinableQueue()
resultsQueue = mp.Queue()

# Highest page number handed to the workers so far (see updateJobs).
highestRequestedPageNumber = 0
def updateJobs(highestPageNoWithResponse):
    """Queue further pages so workers stay ahead of the last good page.

    Every page number up to ``highestPageNoWithResponse + ProcessPoolSize``
    that has not been requested yet is put on ``pageNumbersQueue``, and the
    module-level ``highestRequestedPageNumber`` is advanced to match.

    Results can arrive out of order, so this may be called with a page
    number lower than one seen before. In that case nothing is queued and
    the high-water mark is left alone; lowering it would make a later call
    queue the same pages a second time.
    """
    global highestRequestedPageNumber
    highestPageNumberToTry = highestPageNoWithResponse + ProcessPoolSize
    if highestPageNumberToTry <= highestRequestedPageNumber:
        return
    for pageNumber in range(highestRequestedPageNumber + 1,
                            highestPageNumberToTry + 1):
        pageNumbersQueue.put(pageNumber)
    highestRequestedPageNumber = highestPageNumberToTry
# Only the process that was launched as the script may start workers.
# Start methods that re-import the main module in each child (spawn,
# forkserver) would otherwise re-run this section in every worker.
if __name__ == "__main__":
    for _ in range(ProcessPoolSize):
        counterProcess = mp.Process(
            target=countIntoQueue,
            args=(ParrotRegex, Types, pageNumbersQueue, resultsQueue,
                  threadUrl))
        counterProcess.start()

    # Seed the first batch of pages.
    updateJobs(0)

    # Add up results as they arrive, requesting more pages each time, until
    # some worker reports a page with no counts.
    while True:
        result = resultsQueue.get()
        if result.counts is None:
            # Tell every worker to stop, then wait until all queued page
            # numbers (and the sentinels) have been acknowledged.
            poisonQueue(pageNumbersQueue)
            pageNumbersQueue.join()
            break
        updateJobs(result.pageNumber)
        counts = addCounts(counts, result.counts)

    # Collect results that were produced while the workers were winding down.
    # NOTE(review): multiprocessing documents Queue.empty() as not reliable;
    # a result still in a worker's feeder thread could be missed here.
    while not resultsQueue.empty():
        result = resultsQueue.get()
        if result.counts is not None:
            counts = addCounts(counts, result.counts)

    printResults(counts)