Continue reading
from random import randint
import png
def main():
num_iterations = 1000000
dim = 600
corners = [{'x': 600, 'y': 0}, {'x': 0, 'y': 300}, {'x': 600, 'y': 600}]
grid = []
for i in range(dim):
grid.append([1]*dim)
x = dim
y = dim
for idx in range(num_iterations):
a = randint(0, 2)
x = (x + corners[a]['x']) / 2
y = (y + corners[a]['y']) / 2
grid[x][y] = 0
f = open('sierpinski.png', 'wb')
w = png.Writer(len(grid[0]), len(grid), greyscale=True, bitdepth=1)
w.write(f, grid)
f.close()
if __name__ == '__main__':
main()
N=2 => space size = 37**2 => image dimensions = 37 x 37
N=3 => space size = 37**3 => image dimensions = 226 x 226
N=4 => space size = 37**4 => image dimensions = 1369 x 1369
N=5 => space size = 37**5 => image dimensions = 8328 x 8328
N=6 => space size = 37**6 => image dimensions = 50653 x 50653
AA AB AC AD AE AF AG AH AI AJ AK AL AM AN AO AP AQ AR AS AT AU AV AW AX AY AZ A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 0 BA BB BC BD BE BF BG BH BI BJ BK BL BM BN BO BP BQ BR BS BT BU BV BW BX BY BZ B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 0 CA CB CC CD CE CF CG CH CI CJ CK CL CM CN CO CP CQ CR CS CT CU CV CW CX CY CZ C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 0 DA DB DC DD DE DF DG DH DI DJ DK DL DM DN DO DP DQ DR DS DT DU DV DW DX DY DZ D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 0 EA EB EC ED EE EF EG EH EI EJ EK EL EM EN EO EP EQ ER ES ET EU EV EW EX EY EZ E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 0 FA FB FC FD FE FF FG FH FI FJ FK FL FM FN FO FP FQ FR FS FT FU FV FW FX FY FZ F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 0 GA GB GC GD GE GF GG GH GI GJ GK GL GM GN GO GP GQ GR GS GT GU GV GW GX GY GZ G0 G1 G2 G3 G4 G5 G6 G7 G8 G9 0 HA HB HC HD HE HF HG HH HI HJ HK HL HM HN HO HP HQ HR HS HT HU HV HW HX HY HZ H0 H1 H2 H3 H4 H5 H6 H7 H8 H9 0 IA IB IC ID IE IF IG IH II IJ IK IL IM IN IO IP IQ IR IS IT IU IV IW IX IY IZ I0 I1 I2 I3 I4 I5 I6 I7 I8 I9 0 JA JB JC JD JE JF JG JH JI JJ JK JL JM JN JO JP JQ JR JS JT JU JV JW JX JY JZ J0 J1 J2 J3 J4 J5 J6 J7 J8 J9 0 KA KB KC KD KE KF KG KH KI KJ KK KL KM KN KO KP KQ KR KS KT KU KV KW KX KY KZ K0 K1 K2 K3 K4 K5 K6 K7 K8 K9 0 LA LB LC LD LE LF LG LH LI LJ LK LL LM LN LO LP LQ LR LS LT LU LV LW LX LY LZ L0 L1 L2 L3 L4 L5 L6 L7 L8 L9 0 MA MB MC MD ME MF MG MH MI MJ MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ M0 M1 M2 M3 M4 M5 M6 M7 M8 M9 0 NA NB NC ND NE NF NG NH NI NJ NK NL NM NN NO NP NQ NR NS NT NU NV NW NX NY NZ N0 N1 N2 N3 N4 N5 N6 N7 N8 N9 0 OA OB OC OD OE OF OG OH OI OJ OK OL OM ON OO OP OQ OR OS OT OU OV OW OX OY OZ O0 O1 O2 O3 O4 O5 O6 O7 O8 O9 0 PA PB PC PD PE PF PG PH PI PJ PK PL PM PN PO PP PQ PR PS PT PU PV PW PX PY PZ P0 P1 P2 P3 P4 P5 P6 P7 P8 P9 0 QA QB QC QD QE QF QG QH QI QJ QK QL QM QN QO QP QQ QR QS QT QU QV QW QX QY QZ Q0 Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 0 RA RB RC RD RE RF RG RH RI RJ RK RL RM RN RO RP RQ RR RS RT RU RV RW RX RY RZ R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 0 SA SB SC SD SE SF SG SH SI SJ SK SL SM SN SO SP SQ SR SS ST SU SV SW SX SY SZ S0 S1 S2 S3 S4 S5 S6 S7 S8 S9 0 TA TB TC TD TE TF TG TH TI TJ TK TL TM TN TO TP TQ TR TS TT TU TV TW TX TY TZ T0 T1 T2 T3 T4 T5 T6 T7 T8 T9 0 UA UB UC UD UE UF UG UH UI UJ UK UL UM UN UO UP UQ UR US UT UU UV UW UX UY UZ U0 U1 U2 U3 U4 U5 U6 U7 U8 U9 0 VA VB VC VD VE VF VG VH VI VJ VK VL VM VN VO VP VQ VR VS VT VU VV VW VX VY VZ V0 V1 V2 V3 V4 V5 V6 V7 V8 V9 0 WA WB WC WD WE WF WG WH WI WJ WK WL WM WN WO WP WQ WR WS WT WU WV WW WX WY WZ W0 W1 W2 W3 W4 W5 W6 W7 W8 W9 0 XA XB XC XD XE XF XG XH XI XJ XK XL XM XN XO XP XQ XR XS XT XU XV XW XX XY XZ X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 0 YA YB YC YD YE YF YG YH YI YJ YK YL YM YN YO YP YQ YR YS YT YU YV YW YX YY YZ Y0 Y1 Y2 Y3 Y4 Y5 Y6 Y7 Y8 Y9 0 ZA ZB ZC ZD ZE ZF ZG ZH ZI ZJ ZK ZL ZM ZN ZO ZP ZQ ZR ZS ZT ZU ZV ZW ZX ZY ZZ Z0 Z1 Z2 Z3 Z4 Z5 Z6 Z7 Z8 Z9 0 0A 0B 0C 0D 0E 0F 0G 0H 0I 0J 0K 0L 0M 0N 0O 0P 0Q 0R 0S 0T 0U 0V 0W 0X 0Y 0Z 00 01 02 03 04 05 06 07 08 09 0 1A 1B 1C 1D 1E 1F 1G 1H 1I 1J 1K 1L 1M 1N 1O 1P 1Q 1R 1S 1T 1U 1V 1W 1X 1Y 1Z 10 11 12 13 14 15 16 17 18 19 0 2A 2B 2C 2D 2E 2F 2G 2H 2I 2J 2K 2L 2M 2N 2O 2P 2Q 2R 2S 2T 2U 2V 2W 2X 2Y 2Z 20 21 22 23 24 25 26 27 28 29 0 3A 3B 3C 3D 3E 3F 3G 3H 3I 3J 3K 3L 3M 3N 3O 3P 3Q 3R 3S 3T 3U 3V 3W 3X 3Y 3Z 30 31 32 33 34 35 36 37 38 39 0 4A 4B 4C 4D 4E 4F 4G 4H 4I 4J 4K 4L 4M 4N 4O 4P 4Q 4R 4S 4T 4U 4V 4W 4X 4Y 4Z 40 41 42 43 44 45 46 47 48 49 0 5A 5B 5C 5D 5E 5F 5G 5H 5I 5J 5K 5L 5M 5N 5O 5P 5Q 5R 5S 5T 5U 5V 5W 5X 5Y 5Z 50 51 52 53 54 55 56 57 58 59 0 6A 6B 6C 6D 6E 6F 6G 6H 6I 6J 6K 6L 6M 6N 6O 6P 6Q 6R 6S 6T 6U 6V 6W 6X 6Y 6Z 60 61 62 63 64 65 66 67 68 69 0 7A 7B 7C 7D 7E 7F 7G 7H 7I 7J 7K 7L 7M 7N 7O 7P 7Q 7R 7S 7T 7U 7V 7W 7X 7Y 7Z 70 71 72 73 74 75 76 77 78 79 0 8A 8B 8C 8D 8E 8F 8G 8H 8I 8J 8K 8L 8M 8N 8O 8P 8Q 8R 8S 8T 8U 8V 8W 8X 8Y 8Z 80 81 82 83 84 85 86 87 88 89 0 9A 9B 9C 9D 9E 9F 9G 9H 9I 9J 9K 9L 9M 9N 9O 9P 9Q 9R 9S 9T 9U 9V 9W 9X 9Y 9Z 90 91 92 93 94 95 96 97 98 99 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
from math import sqrt
import png
def permutations(items, n):
if n == 0:
yield ''
else:
for i in range(len(items)):
for base in permutations(items, n-1):
yield str(items[i]) + str(base)
def square_dimension(n):
srn = sqrt(n)
if srn > int(srn):
return int(srn) + 1
else:
return int(srn)
def get_tlds(word_len):
f = open('com.zone', 'r')
f.seek(811)
s = set()
for line in f:
tld = line.split(' ')[0]
if len(tld) == word_len:
s.add(tld)
return s
def main():
# Valid characters.
alphabet = ['A', 'B', 'C', 'D', 'E', 'F',
'G', 'H', 'I', 'J', 'K', 'L',
'M', 'N', 'O', 'P', 'Q', 'R',
'S', 'T', 'U', 'V', 'W', 'X',
'Y', 'Z', '0', '1', '2', '3',
'4', '5', '6', '7', '8', '9',
'-']
# Length of words to visualize.
word_len = 4
# Total word space size for strings of length word_len.
n = len(alphabet)**word_len
# Size of square that will hold entire word space. For when 2 does not divide word_len.
dim = square_dimension(n)
# Initalize empty grid
data = []
for i in range(dim):
data.append([0]*dim)
# Build a set of all TLDs of length word_len.
tlds = get_tlds(word_len)
# Create pixel data.
for idx, word in enumerate(permutations(alphabet, word_len)):
if word.startswith('-') or word.endswith('-'):
continue
i = idx // dim
j = idx % dim
if word in tlds:
data[i][j] = 1
else:
data[i][j] = 0
f = open('4.png', 'wb')
w = png.Writer(len(data[0]), len(data), greyscale=True, bitdepth=1)
w.write(f, data)
f.close()
if __name__ == '__main__':
main()
3 5 6 9 1 0 4 4 9 5 1 8 2 5 6 5 5 0 8 3 4 9 6 0 9 8 9 6 6 4 1 7 4 2 3 2 2 2 2 2 5 9 3 8 9 0 3 8 8 1 9 3 3 8 9 9 0 6 0 1 2 2 8 4 3 7 7 2 3 7 6 3 3 1 7 4 8 0 4 0 5 8 2 9 5 9 1 8 8 6 3 4 7 7 1 4 6 9 2 7
colors = ximport("colors")
def pi_digits():
"""
Generator for digits of pi
See:
http://web.comlab.ox.ac.uk/people/Jeremy.Gibbons/publications/spigot.pdf
"""
q,r,t,k,n,l = 1,0,1,1,3,3
while True:
if 4*q+r-t < n*t:
yield n
q,r,t,k,n,l = (10*q,10*(r-n*t),t,k,(10*(3*q+r))/t-10*n,l)
else:
q,r,t,k,n,l = (q*k,(2*q+r)*l,t*l,k+1,(q*(7*k+2)+r*l)/(t*l),l+2)
def get_grid_index(pi_grid, i, j):
try:
up = pi_grid[i-1][j]
down = pi_grid[i+1][j]
left = pi_grid[i][j-1]
right = pi_grid[i][j+1]
except IndexError:
up = pi_grid[i-1][j]
down = pi_grid[i][j]
left = pi_grid[i][j-1]
right = pi_grid[i][j]
return up + 10*down + 100*left + 1000*right
# Begin drawing
rows = 50
cols = 50
# Initialize an empty matrix
pi_grid = []
for i in range(cols):
pi_grid.append([0]*rows)
# Fill the matrix with the digits of pi
digits = pi_digits()
for i in range(cols):
for j in range(rows):
pi_grid[i][j] = digits.next()
# Canvas size
size(50*25, 50*25)
# Black lines
stroke(0)
# Scale and width parameters in pixels
s_px = 25
w_px = 25
# Color lists
circle_clrs = colors.gradient([color(0.15, 0.1, 0.7), color(1, 1, 1)], steps=10000, spread=0.15)
square_clrs = colors.gradient([color(1, 1, 1), color(0.15, 0.1, 0.7)], steps=10000, spread=0.15)
# Draw to the canvas
for i in range(cols):
for j in range(rows):
x = (i)*s_px
y = (j)*s_px
d = pi_grid[i][j]*w_px/10
idx = get_grid_index(pi_grid, i, j)
rect(x, y, w_px, w_px, fill=square_clrs[idx])
oval(x+(w_px/2. - d/2.), y+(w_px/2. - d/2.), d, d, fill=circle_clrs[idx])
rows = 15
cols = 10
iters = 10
s_px = 40
w_px = 55
width = (iters - 1 + cols - 1) * s_px + w_px + 0.5
height = (rows - 1 + cols - 1) * s_px + w_px + 0.5
size(width, height)
for i in range(cols):
for j in range(rows):
for k in range(iters):
if (random() > 0.5):
stroke(1, 1, 1, random())
fill(0, 0.6, 0.9, random()*.15)
else:
fill(0.7, 0.5, 0.6, random()*.15)
stroke(0, 0, 0, random())
rect((i+k)*s_px, j*s_px, w_px, w_px)
rect((i+k)*s_px, (j+iters+5)*s_px, w_px, w_px)
t_i appears. Or, in symbols:
text field contains a free-form English paragraph. In order to be able to compute the TF-IDF weights as described above, it is necessary tokenize this paragraph. To do this, I chose to use the NLTK library which is a collection of natural language processing algorithms written in Python. Tokenizing the documents in the corpus is a two step process: first the text is split into sentences, and then the sentences are split into the individual words. To tokenize the sentences I use the Punkt sentence tokenizer, while to tokenize the words I use the Treebank word tokenizer. Interested readers can find more information about these algorithms in the NLTK documentation. The code shown below will tokenize each document in the corpus and compute the normalized term frequencies.
import os
import nltk
import couchdb
def tokenize_document(document):
tokens = []
sentences = nltk.sent_tokenize(document)
for sentence in sentences:
words = nltk.word_tokenize(sentence)
for word in words:
tokens.append(word)
return tokens
def clean_tokens(tokens):
clean_tokens = []
for token in tokens:
if len(token) < 2: # Only save words longer than 2 chars
continue
clean_tokens.append(token.lower()) # Always lower case
return clean_tokens
def process_document(document):
return clean_tokens(tokenize_document(document))
def get_term_freq(tokens):
tf = {}
for token in tokens:
if token not in tf:
tf[token] = float(tokens.count(token))/float(len(tokens))
return tf
def main():
server = couchdb.Server('http://127.0.0.1:5984')
db = server['rcv1_test_100']
# Tokenize and compute term frequencies for all the documents in the db.
for result in db.query('''function(doc) { if (!('tf' in doc)) emit(doc._id, null); }'''):
doc = db[result.key]
tokens = process_document(doc['text'])
doc['tokens'] = tokens
doc['tf'] = get_term_freq(tokens)
db[result.key] = doc
if __name__ == '__main__':
main()
tf field are by construction, unique, we can use map-reduce to count the number of documents each term appears in across the entire corpus. Note that to compute the Inverse Document Frequency, I also need to know the total number of documents in the corpus. I present both views below.
/* document frequency: 'docfreq' */
map:function(doc) {
if ("tf" in doc) {
for (term in doc.tf) {
emit(term, 1);
}
}
}
reduce:function(keys, values) {
return sum(values);
}
/* all tokenized documents: 'all_docs_count' */
map:function(doc) {
if ("tf" in doc) {
emit(null, 1);
}
}
reduce:function(keys, values) {
return sum(values);
}
from math import log
import couchdb
def main():
server = couchdb.Server('http://127.0.0.1:5984')
db = server['rcv1_test_100']
# The number of documents in the database (exclude design docs).
for result in db.view('_design/test/_view/all_docs_count'):
num_docs = result.value
# Document frequency. Number of documents a term appears in.
df = {}
for result in db.view('_design/test/_view/docfreq', group=True):
df[result.key] = log(float(num_docs)/float(result.value))
# Compute tf-idf for each document.
for result in db.query('''function(doc) { if (!('tf_idf' in doc)) emit(doc._id, null); }'''):
doc = db[result.key]
tf = doc['tf']
tf_idf = {}
for k, v in tf.items(): # By construction, there should never be KeyError's here
tf_idf[k] = v * df[k]
doc['tf_idf'] = tf_idf
db[result.key] = doc
if __name__ == '__main__':
main()
{
"_id": "99642-1996-10-07",
"_rev": "3-4daa8aea56f3be7af795c5568982859b",
"itemid": "99642",
"codes": {
"topics": [
"M14",
"M141",
"MCAT"
],
"countries": [
"ARG"
]
},
"title": "ARGENTINA: Argentine cash maize drifts lower, weather good.",
"text": "Argentine cash maize drifted lower in lazy trade Monday, with
continuing good waether leaving the market relaxed about crop prospects and
some exporters selling, traders said. January wheat also slipped in slow
trade. Cash soybean prices were stable in low volume. The market was a bit
lost due to a holiday in the main soybean center, Rosario. But traders said
underlying demand was good. Cash sunflower seed fell in thin turnover. Oil
prices are weak and crushers have built up stocks of seed. -- Jason Webb,
Buenos Aires Newsroom +541 318-0655",
"tokens": [
"argentine", "cash", "maize", "drifted", "lower", "in", "lazy",
"trade", "monday", "with", "continuing", "good", "waether", "leaving",
"the", "market", "relaxed", "about", "crop", "prospects", "and", "some",
"exporters", "selling", "traders", "said", "january", "wheat", "also",
"slipped", "in", "slow", "trade", "cash", "soybean", "prices", "were",
"stable", "in", "low", "volume", "the", "market", "was", "bit", "lost",
"due", "to", "holiday", "in", "the", "main", "soybean", "center",
"rosario", "but", "traders", "said", "underlying", "demand", "was", "good",
"cash", "sunflower", "seed", "fell", "in", "thin", "turnover", "oil",
"prices", "are", "weak", "and", "crushers", "have", "built", "up",
"stocks", "of", "seed", "--", "jason", "webb", "buenos", "aires",
"newsroom", "541", "318-0655"
],
"date": "1996-10-07",
"tf": {
"and": 0.02247191011235955, "webb": 0.011235955056179775, "jason":
0.011235955056179775, "have": 0.011235955056179775, "slipped":
0.011235955056179775, "some": 0.011235955056179775, "crop":
0.011235955056179775, "trade": 0.02247191011235955, "seed":
0.02247191011235955, "are": 0.011235955056179775, "soybean":
0.02247191011235955, "in": 0.056179775280898875, "drifted":
0.011235955056179775, "leaving": 0.011235955056179775, "market":
0.02247191011235955, "maize": 0.011235955056179775, "said":
0.02247191011235955, "built": 0.011235955056179775, "sunflower":
0.011235955056179775, "also": 0.011235955056179775, "newsroom":
0.011235955056179775, "due": 0.011235955056179775, "cash":
0.033707865168539325, "traders": 0.02247191011235955, "to":
0.011235955056179775, "exporters": 0.011235955056179775, "541":
0.011235955056179775, "low": 0.011235955056179775, "buenos":
0.011235955056179775, "stable": 0.011235955056179775, "holiday":
0.011235955056179775, "was": 0.02247191011235955, "argentine":
0.011235955056179775, "lazy": 0.011235955056179775, "selling":
0.011235955056179775, "wheat": 0.011235955056179775, "monday":
0.011235955056179775, "continuing": 0.011235955056179775, "relaxed":
0.011235955056179775, "waether": 0.011235955056179775, "weak":
0.011235955056179775, "lower": 0.011235955056179775, "oil":
0.011235955056179775, "volume": 0.011235955056179775, "were":
0.011235955056179775, "slow": 0.011235955056179775, "rosario":
0.011235955056179775, "but": 0.011235955056179775, "demand":
0.011235955056179775, "prices": 0.02247191011235955, "318-0655":
0.011235955056179775, "bit": 0.011235955056179775, "stocks":
0.011235955056179775, "with": 0.011235955056179775, "prospects":
0.011235955056179775, "main": 0.011235955056179775, "center":
0.011235955056179775, "lost": 0.011235955056179775, "--":
0.011235955056179775, "of": 0.011235955056179775, "up":
0.011235955056179775, "crushers": 0.011235955056179775, "good":
0.02247191011235955, "underlying": 0.011235955056179775, "thin":
0.011235955056179775, "january": 0.011235955056179775, "about":
0.011235955056179775, "the": 0.033707865168539325, "turnover":
0.011235955056179775, "fell": 0.011235955056179775, "aires":
0.011235955056179775
},
"tf_idf": {
"and": 0.004735304074509046, "webb": 0.043955314667731976, "january":
0.02837897353155343, "jason": 0.05174348523582126, "have":
0.01315935934272972, "slipped": 0.03939952693617957, "up":
0.01179575420785031, "some": 0.016513213146729683, "crop":
0.03939952693617957, "trade": 0.038534796136897226, "seed":
0.08791062933546395, "are": 0.009747197389940709, "soybean":
0.10348697047164251, "in": 0.010467953830982783, "drifted":
0.05174348523582126, "aires": 0.03939952693617957, "crushers":
0.05174348523582126, "market": 0.033026426293459366, "maize":
0.03939952693617957, "said": 0.006167120128129446, "built":
0.043955314667731976, "sunflower": 0.043955314667731976, "to":
0.0019590268218514354, "newsroom": 0.014302985121493118, "due":
0.02382318580000102, "stocks": 0.027055568636537883, "traders":
0.04584765906801246, "also": 0.016513213146729683, "541":
0.043955314667731976, "low": 0.02837897353155343, "buenos":
0.03939952693617957, "stable": 0.043955314667731976, "holiday":
0.03939952693617957, "was": 0.015576341136178545, "argentine":
0.03939952693617957, "thin": 0.043955314667731976, "lazy":
0.043955314667731976, "selling": 0.033659913185999896, "wheat":
0.0361671440996427, "monday": 0.017012671153143543, "weak":
0.0361671440996427, "relaxed": 0.05174348523582126, "waether":
0.05174348523582126, "continuing": 0.0361671440996427, "prospects":
0.03939952693617957, "oil": 0.02987932625767166, "volume":
0.02837897353155343, "slow": 0.043955314667731976, "but":
0.011171373857796258, "demand": 0.02480084172123282, "prices":
0.042631909772716435, "318-0655": 0.043955314667731976, "bit":
0.043955314667731976, "exporters": 0.03939952693617957, "with":
0.006717269671411465, "lower": 0.027055568636537883, "main":
0.02837897353155343, "center": 0.05174348523582126, "lost":
0.031611356368090295, "--": 0.010017956396447005, "of":
0.0011838260186272623, "cash": 0.07761522785373189, "leaving":
0.05174348523582126, "good": 0.06322271273618059, "underlying":
0.043955314667731976, "rosario": 0.05174348523582126, "were":
0.01179575420785031, "about": 0.015576341136178545, "the":
0.0043089900508949995, "fell": 0.02987932625767166, "turnover":
0.03939952693617957
}
}
"""
This script submits queries to Reuters' Spotlight service, and persists
the results in CouchDB. You must have a (free) API key from Reuters to
use their service.
It is assumed that CouchDB is running locally on the default port and that a
database called 'reuters_test' exists.
Depends on:
couchdb-python: http://code.google.com/p/couchdb-python/
Further reading:
http://spotlight.reuters.com/ (spotlight)
http://couchdb.apache.org/ (couchdb)
"""
import sys
import urllib
import simplejson
import couchdb
REUTERS_BASE = 'http://spotlight.reuters.com/api/feed'
REUTERS_ERROR_MSG = r'''<error code="1000" title="Error"></error>'''
APIKEY = 'YOUR API KEY'
# For more choices see http://spotlight.reuters.com/handbook/content-feeds
CHANNELS = ['topNews', 'newsOne', 'businessNews']
class ReutersError(Exception):
pass
def get_articles(content='channelarticles',
edition='us',
channel='topnews',
format='json',
version='1.0',
count=1,
callback='x',
**kwargs):
"""
Submits a query to Reuter's spotlight API.
Returns the 'items' field of the parsed JSON object.
"""
kwargs.update({
'content': content,
'edition': edition,
'channel': channel,
'format': format,
'version': version,
'count': count,
'callback': callback,
'apikey': APIKEY
})
url = REUTERS_BASE + '?' + urllib.urlencode(kwargs)
content = urllib.urlopen(url).read()
if REUTERS_ERROR_MSG in content:
raise ReutersError(content)
else:
ucontent = unicode(content[2:len(content)-2], 'UTF-8')
ucontent = ucontent.replace('\\\'','\\\\\'')
result = simplejson.loads(ucontent)
return result['items']
def main():
"""
Downloads news articles from Reuter's Spotlight API and stores them in
CouchDB.
The data admits a natural key as (`guid` + `published`). When inserting
articles, the procedure is to check if the document exists in the
database, insert it if it does not, otherwise create a new revision.
There is a potential race condition in the delay between db.get() and
db[doc_id].
"""
# Get the articles from Spotlight
all_articles = []
try:
for channel in CHANNELS:
articles = get_articles(channel=channel)
if articles is None:
continue
for article in articles:
all_articles.append(article)
except ReutersError, e:
print>>sys.stderr, 'Error: ReutersError %s' % e
# Put the articles in CouchDB
try:
server = couchdb.Server('http://127.0.0.1:5984')
db = server['reuters_test']
for article in all_articles:
doc_id = article['guid'] + article['published']
doc = db.get(doc_id)
if doc is None:
db[doc_id] = article
else:
article['_rev'] = doc['_rev']
db[doc_id] = article
except couchdb.client.ResourceNotFound, e:
print>>sys.stderr, 'Error: ResourceNotFound %s' % e
except couchdb.client.ResourceConflict, e:
print>>sys.stderr, 'Error: ResourceConflict %s' % e
except Exception, e:
print>>sys.stderr, 'Error: %s' % e
if __name__ == '__main__':
main()