Darn it, I thought I had it:
#tides for #jerseyci today, Thursday 1 October: 03:48 0.6m 09:21 11.7m 16:07 0.9m 21:42 11.5m data from http://mbcurl.me/13KDW
I've been successfully scraping daily tide data, and posting it on my Pi-hosted site here...
jcwyatt.ddns.net
and tweeting it here...
www.twitter.com/#jerseyci
which was a major goal (code is below). Cron runs this Python program every morning at 5:30am.
However now it has come to thinking about calculating live tide heights I've hit a wall when trying to use the data I'm currently scraping.
I've been working on daily data, when what is needed is continuous data over a longer time. Once I have that I think I can calculate live tide height with a rolling algorithm. Tides don't fit in neat daily chunks.
I'm going back to here: http://www.ports.je/Pages/tides.aspx to scrape a month's worth of data at a time and see how it goes.
The code below took a while and is pretty untidy, but it does what it needs to do, with some nifty string and list handling that I'm quite proud of.
#A python program to import tide data from a gov.je website
#tidescrape6.0.py - working fine
#It pulls the data in from the gov.je tide site, which is updated daily
#It looks for the class headers associated with date,time and height information
#and then creates a list of these bits of html
#this version (6.0) is called from a crontab entry and tweets at 5:30am every day.
import tweepy
import smtplib
import urllib2
import re
from bs4 import BeautifulSoup
from time import sleep
import datetime as dt
#function to scrape tide data from website

def _parse_times(timestext):
    """Extract each HH:MM tide time from the concatenated times string.

    The site renders the day's times as one run of 5-character tokens
    (e.g. '03:4809:2116:0721:42'); a regex copes with 3, 4 or however
    many tides the day actually has instead of assuming exactly 4.
    """
    return re.findall(r'\d{2}:\d{2}', timestext)

def _parse_heights(heightstext):
    """Split an 'm'-delimited run of heights (e.g. '0.6m11.7m0.9m') into a list.

    Empty trailing pieces from the final 'm' are discarded.
    """
    return [h for h in heightstext.split('m') if h]

def _build_tweetstring(tidedate, times, heights):
    """Assemble the tweetable one-day summary from date, times and heights."""
    lines = ['#tides for #jerseyci today, ' + tidedate + ':']
    # zip() pairs each time with its height and stops at the shorter list,
    # so a malformed page cannot raise an IndexError here
    for ttime, theight in zip(times, heights):
        lines.append(ttime + ' ' + theight + 'm')
    lines.append('data from http://mbcurl.me/13KDW')
    return '\n'.join(lines)

def tidedatascrape():
    """Scrape today's tide data from the gov.je tide page.

    Returns a tweet-ready string: a header line, one 'HH:MM H.Hm' line
    per tide (3 or 4 depending on the day), and a data-source link.
    """
    #open site - read the whole page; a fixed 20000-byte cap risked
    #truncating the table if the page grew
    rawhtml = urllib2.urlopen("http://www.gov.je/Weather/Pages/Tides.aspx").read()
    #name the parser explicitly so bs4 behaves the same on every install
    soup = BeautifulSoup(rawhtml, "html.parser")
    #from http://stackoverflow.com/questions/14257717/python-beautifulsoup-wildcard-attribute-id-search
    #the class names carry per-row suffixes, hence the wildcard match
    tidedates = soup.findAll('td', {'class': re.compile('TidesDate.*')})
    tidetimes = soup.findAll('td', {'class': re.compile('TidesTime.*')})
    tideheights = soup.findAll('td', {'class': re.compile('TidesHeight.*')})
    #index 0 is today's row; later rows are forecast days
    todaysdate = tidedates[0].get_text()
    todaystimes = tidetimes[0].get_text()
    todaysheights = tideheights[0].get_text()
    #parse and assemble
    times = _parse_times(todaystimes)
    heights = _parse_heights(todaysheights)
    tweetstring = _build_tweetstring(todaysdate, times, heights)
    print(tweetstring)
    #print(len(tweetstring)) #just to check it is within 140 characters
    return tweetstring
#function to write to a text file
def writetidestofile(tweetstring, path='/var/www/dailytideoutput.txt'):
    """Write the tweet string to a text file served by the Pi web server.

    tweetstring -- text to write (coerced to str, as before)
    path -- destination file; the default keeps the original hard-coded
            location so existing callers are unaffected
    """
    #'with' already closes the file on exit; the old explicit close()
    #inside the with-block was redundant
    with open(path, 'w') as f:
        f.write(str(tweetstring))
#function to tweet it
def tweettidedata(tweetstring):
    """Post the tide summary to Twitter via tweepy's OAuth API.

    tweetstring -- the pre-built status text (keep it within Twitter's
                   length limit; see the commented length check above)
    """
    #Replace the placeholders with your own app credentials from
    #apps.twitter.com; keep the quotes. (The originals were split across
    #lines by the blog formatting, which is a syntax error - each key
    #must be a single one-line string literal.)
    CONSUMER_KEY = '0000000000000000000'  #your consumer key
    CONSUMER_SECRET = '00000000000000000000000000000000000000'  #your consumer secret key
    ACCESS_KEY = '00000000000000000000000000000000000000'  #your access token
    ACCESS_SECRET = '00000000000000000000000000000000000000'  #your access token secret
    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_KEY, ACCESS_SECRET)
    api = tweepy.API(auth)
    api.update_status(status=tweetstring)  #THIS LINE TWEETS! - LEAVE DEACTIVATED UNTIL READY
    #email it (commented out for now)
    '''
    fromaddr = 'jbloggs@gmail.com'
    toaddr = 'j.bloggette@free.sch.uk'
    # Credentials (if needed)
    username = raw_input('gmail un: ')
    password = raw_input('gmail pw: ')
    # The actual mail send
    server = smtplib.SMTP('smtp.gmail.com:587')
    server.ehlo()
    server.starttls()
    server.login(username,password)
    headers = "\r\n".join(["from: " + fromaddr,
                           "subject: " + 'Tides Today',
                           "to: " + toaddr,
                           "mime-version: 1.0",
                           "content-type: text/html"])
    # body_of_email can be plaintext or html!
    content = headers + "\r\n\r\n" + tweetstring
    server.sendmail(fromaddr, toaddr, content)
    server.quit()
    '''
#main prog - guard so importing this module (e.g. to reuse the scraper)
#does not immediately scrape, write and tweet; running it from cron
#behaves exactly as before
if __name__ == "__main__":
    #collect data
    tweetstring = tidedatascrape()
    #output to file
    writetidestofile(tweetstring)
    #tweet data
    tweettidedata(tweetstring)
No comments:
Post a Comment