<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	>

<channel>
	<title>BRADINO &#187; Screen Scraping</title>
	<atom:link href="http://www.bradino.com/category/screen-scraping/feed/" rel="self" type="application/rss+xml" />
	<link>http://www.bradino.com</link>
	<description>LAMP Development Tutorials, Code, Tips &#38; Tricks</description>
	<lastBuildDate>Sat, 10 Oct 2009 20:49:03 +0000</lastBuildDate>
	<generator>http://wordpress.org/?v=2.8.4</generator>
	<language>en</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
			<item>
		<title>PHP Screen Scraping Class</title>
		<link>http://www.bradino.com/php/php-screen-scraping-class/</link>
		<comments>http://www.bradino.com/php/php-screen-scraping-class/#comments</comments>
		<pubDate>Fri, 31 Jul 2009 01:04:44 +0000</pubDate>
		<dc:creator>BRADINO</dc:creator>
				<category><![CDATA[PHP]]></category>
		<category><![CDATA[Screen Scraping]]></category>

		<guid isPermaLink="false">http://www.bradino.com/?p=4882</guid>
		<description><![CDATA[After some positive feedback I have decided to continue to develop the PHP Screen Scraping class. This post will serve as the permanent home for the class.
Download PHP Screen Scraping Class
Updates
 2009-07-30 Added setHeader() function




	
	
	
	
	
	
	
	
	
	
	
	
	
	
	
	
	
	
	
	
	
	
	


]]></description>
			<content:encoded><![CDATA[<p>After some positive feedback I have decided to continue to develop the PHP Screen Scraping class. This post will serve as the permanent home for the class.</p>
<p><a href="http://www.bradino.com/downloads/cScrape.txt" target="_blank">Download PHP Screen Scraping Class</a></p>
<p><strong>Updates</strong></p>
<p> 2009-07-30 Added setHeader() function</p>

<div class="sociable">

<ul>
	<li><a rel="nofollow" target="_blank" href="http://digg.com/submit?phase=2&amp;url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping-class%2F&amp;title=PHP%20Screen%20Scraping%20Class" title="Digg"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/digg.png" title="Digg" alt="Digg" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://twitter.com/home?status=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping-class%2F" title="TwitThis"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/twitter.gif" title="TwitThis" alt="TwitThis" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://del.icio.us/post?url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping-class%2F&amp;title=PHP%20Screen%20Scraping%20Class" title="del.icio.us"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/delicious.png" title="del.icio.us" alt="del.icio.us" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.netvouz.com/action/submitBookmark?url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping-class%2F&amp;title=PHP%20Screen%20Scraping%20Class&amp;popup=no" title="Netvouz"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/netvouz.png" title="Netvouz" alt="Netvouz" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.dzone.com/links/add.html?url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping-class%2F&amp;title=PHP%20Screen%20Scraping%20Class" title="description"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/dzone.png" title="description" alt="description" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://reddit.com/submit?url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping-class%2F&amp;title=PHP%20Screen%20Scraping%20Class" title="Reddit"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/reddit.png" title="Reddit" alt="Reddit" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.furl.net/storeIt.jsp?u=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping-class%2F&amp;t=PHP%20Screen%20Scraping%20Class" title="Furl"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/furl.png" title="Furl" alt="Furl" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.newsvine.com/_tools/seed&amp;save?u=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping-class%2F&amp;h=PHP%20Screen%20Scraping%20Class" title="NewsVine"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/newsvine.png" title="NewsVine" alt="NewsVine" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.simpy.com/simpy/LinkAdd.do?href=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping-class%2F&amp;title=PHP%20Screen%20Scraping%20Class" title="Simpy"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/simpy.png" title="Simpy" alt="Simpy" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://slashdot.org/bookmark.pl?title=PHP%20Screen%20Scraping%20Class&amp;url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping-class%2F" title="Slashdot"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/slashdot.png" title="Slashdot" alt="Slashdot" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.spurl.net/spurl.php?url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping-class%2F&amp;title=PHP%20Screen%20Scraping%20Class" title="Spurl"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/spurl.png" title="Spurl" alt="Spurl" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.stumbleupon.com/submit?url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping-class%2F&amp;title=PHP%20Screen%20Scraping%20Class" title="StumbleUpon"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/stumbleupon.png" title="StumbleUpon" alt="StumbleUpon" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://myweb2.search.yahoo.com/myresults/bookmarklet?u=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping-class%2F&amp;=PHP%20Screen%20Scraping%20Class" title="YahooMyWeb"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/yahoomyweb.png" title="YahooMyWeb" alt="YahooMyWeb" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://tailrank.com/share/?text=&amp;link_href=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping-class%2F&amp;title=PHP%20Screen%20Scraping%20Class" title="TailRank"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/tailrank.png" title="TailRank" alt="TailRank" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://technorati.com/faves?add=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping-class%2F" title="Technorati"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/technorati.png" title="Technorati" alt="Technorati" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.facebook.com/share.php?u=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping-class%2F&amp;t=PHP%20Screen%20Scraping%20Class" title="Facebook"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/facebook.png" title="Facebook" alt="Facebook" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.google.com/bookmarks/mark?op=edit&amp;bkmk=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping-class%2F&amp;title=PHP%20Screen%20Scraping%20Class" title="Google"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/googlebookmark.png" title="Google" alt="Google" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.linkedin.com/shareArticle?mini=true&amp;url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping-class%2F&amp;title=PHP%20Screen%20Scraping%20Class&amp;source=BRADINO+LAMP+Development+Tutorials%2C+Code%2C+Tips+%26amp%3B+Tricks&amp;summary=After%20some%20positive%20feedback%20I%20have%20decided%20to%20continue%20to%20develop%20the%20PHP%20Screen%20Scraping%20class.%20This%20post%20will%20server%20as%20the%20permanent%20home%20for%20the%20class.%0D%0A%0D%0ADownload%20PHP%20Screen%20Scraping%20Class%0D%0A%0D%0AUpdates%0D%0A%0D%0A%2020009-07-30%20Added%20setHeader%28%29%20function%0D%0A" title="LinkedIn"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/linkedin.png" title="LinkedIn" alt="LinkedIn" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="https://favorites.live.com/quickadd.aspx?marklet=1&amp;url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping-class%2F&amp;title=PHP%20Screen%20Scraping%20Class" title="Live"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/live.png" title="Live" alt="Live" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.myspace.com/Modules/PostTo/Pages/?u=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping-class%2F&amp;t=PHP%20Screen%20Scraping%20Class" title="MySpace"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/myspace.png" title="MySpace" alt="MySpace" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://ping.fm/ref/?link=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping-class%2F&amp;title=PHP%20Screen%20Scraping%20Class" title="Ping.fm"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/ping.gif" title="Ping.fm" alt="Ping.fm" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://buzz.yahoo.com/submit/?submitUrl=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping-class%2F&amp;submitHeadline=PHP%20Screen%20Scraping%20Class&amp;submitSummary=After%20some%20positive%20feedback%20I%20have%20decided%20to%20continue%20to%20develop%20the%20PHP%20Screen%20Scraping%20class.%20This%20post%20will%20server%20as%20the%20permanent%20home%20for%20the%20class.%0D%0A%0D%0ADownload%20PHP%20Screen%20Scraping%20Class%0D%0A%0D%0AUpdates%0D%0A%0D%0A%2020009-07-30%20Added%20setHeader%28%29%20function%0D%0A&amp;submitCategory=science&amp;submitAssetType=text" title="Yahoo! Buzz"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/yahoobuzz.gif" title="Yahoo! Buzz" alt="Yahoo! Buzz" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="mailto:?subject=PHP%20Screen%20Scraping%20Class&amp;body=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping-class%2F" title="E-mail this story to a friend!"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/email_link.png" title="E-mail this story to a friend!" alt="E-mail this story to a friend!" class="sociable-hovers" /></a></li>
</ul>
</div>
]]></content:encoded>
			<wfw:commentRss>http://www.bradino.com/php/php-screen-scraping-class/feed/</wfw:commentRss>
		<slash:comments>2</slash:comments>
		</item>
		<item>
		<title>Screen Scraping Twitter</title>
		<link>http://www.bradino.com/php/screen-scraping-twitter/</link>
		<comments>http://www.bradino.com/php/screen-scraping-twitter/#comments</comments>
		<pubDate>Sat, 28 Mar 2009 17:20:22 +0000</pubDate>
		<dc:creator>BRADINO</dc:creator>
				<category><![CDATA[PHP]]></category>
		<category><![CDATA[Screen Scraping]]></category>

		<guid isPermaLink="false">http://www.bradino.com/?p=4845</guid>
		<description><![CDATA[I got an email today asking for help to scrape Twitter. In particular, to be able to login. So I am going to show everyone, NOT to encourage anyone to violate Twitter's terms of use but as an educational blog post about how PHP and cURL can be used to post variables and store cookies.
Again, [...]]]></description>
			<content:encoded><![CDATA[<p>I got an email today asking for help to scrape Twitter. In particular, to be able to login. So I am going to show everyone, NOT to encourage anyone to violate Twitter's terms of use but as an educational blog post about how PHP and cURL can be used to post variables and store cookies.</p>
<p>Again, I am using the cScrape class I wrote, which you can <a href="http://www.bradino.com/downloads/cScrape.txt">download</a>. </p>
<p><strong>Step 1</strong><br />
First go to twitter.com and look at the source code of the login to get the form field names and the form post location. You will see that the form posts to https://twitter.com/sessions and the username and password fields are session[username_or_email] and session[password] respectively.</p>
<p><strong>Step 2</strong><br />
Now you are ready to login. So using the fetch function in the Scrape class you create an associative array to contain the form values you want to post. The other thing you will need to do is uncomment the lines for CURLOPT_COOKIEFILE and CURLOPT_COOKIEJAR. Cookies will be required to stay logged in and scrape around. The paths to the cookie files need to be writable by your app. Also you will need to uncomment the line about CURLOPT_FOLLOWLOCATION. </p>
<div class="syntax_hilite">
<div id="php-3">
<div class="php"><span style="color:#0000FF;">$data</span> = <a href="http://www.bradino.com/php-functions/array/"><span style="color:#000066;">array</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#FF0000;">'session[username_or_email]'</span> =&gt; <span style="color:#FF0000;">"bradino"</span>, <span style="color:#FF0000;">'session[password]'</span> =&gt; <span style="color:#FF0000;">"secret"</span><span style="color:#006600; font-weight:bold;">&#41;</span>;</p>
<p><span style="color:#0000FF;">$scrape</span>-&gt;<span style="color:#006600;">fetch</span><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#FF0000;">'https://twitter.com/sessions'</span>,<span style="color:#0000FF;">$data</span><span style="color:#006600; font-weight:bold;">&#41;</span>;</div>
</div>
</div>
<p></p>
<p><strong>Step 1.5</strong><br />
Oops that didn't work. All I got back was <em>403 Forbidden: The server understood the request, but is refusing to fulfill it</em>. Ahhh I see another variable called authenticity_token I bet Twitter was looking for that. So let's back up and first hit twitter.com to get the authenticity_token variable, and then make the login post request with that variable included in our array of parameters.</p>
<div class="syntax_hilite">
<div id="php-4">
<div class="php"><span style="color:#0000FF;">$scrape</span>-&gt;<span style="color:#006600;">fetch</span><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#FF0000;">'https://twitter.com'</span><span style="color:#006600; font-weight:bold;">&#41;</span>;</p>
<p><span style="color:#0000FF;">$data</span> = <a href="http://www.bradino.com/php-functions/array/"><span style="color:#000066;">array</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#FF0000;">'session[username_or_email]'</span> =&gt; <span style="color:#FF0000;">"bradino"</span>, <span style="color:#FF0000;">'session[password]'</span> =&gt; <span style="color:#FF0000;">"secret"</span><span style="color:#006600; font-weight:bold;">&#41;</span>;</p>
<p><span style="color:#0000FF;">$data</span><span style="color:#006600; font-weight:bold;">&#91;</span><span style="color:#FF0000;">'authenticity_token'</span><span style="color:#006600; font-weight:bold;">&#93;</span> = <span style="color:#0000FF;">$scrape</span>-&gt;<span style="color:#006600;">fetchBetween</span><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#FF0000;">'name=&quot;authenticity_token&quot; type=&quot;hidden&quot; value=&quot;'</span>,<span style="color:#FF0000;">'&quot;'</span>,<span style="color:#0000FF;">$scrape</span>-&gt;<span style="color:#006600;">result</span><span style="color:#006600; font-weight:bold;">&#41;</span>;</p>
<p><span style="color:#0000FF;">$scrape</span>-&gt;<span style="color:#006600;">fetch</span><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#FF0000;">'https://twitter.com/sessions'</span>,<span style="color:#0000FF;">$data</span><span style="color:#006600; font-weight:bold;">&#41;</span>;</p>
<p><a href="http://www.bradino.com/php-functions/echo/"><span style="color:#000066;">echo</span></a> <span style="color:#0000FF;">$scrape</span>-&gt;<span style="color:#006600;">result</span>;</div>
</div>
</div>
<p></p>
<p>So that's basically it. Now you are logged in and can scrape around and request other pages as you normally would. Sorry it wasn't a longer post. I really do enjoy this kind of stuff so if anyone has a request, hit me up.</p>
<p><strong>Errors?</strong><br />
1) Make sure that you are properly parsing the token variable<br />
2) Make sure that you uncommented the lines about CURLOPT_COOKIEFILE and CURLOPT_COOKIEJAR, those options need to be enabled and be sure the path set is writable by your application<br />
3) Make sure that the path to the cookie file is writable and that it is getting data written to it<br />
4) If you get a message about being redirected you need to uncomment the line about CURLOPT_FOLLOWLOCATION, that option needs to be set to true</p>

<div class="sociable">

<ul>
	<li><a rel="nofollow" target="_blank" href="http://digg.com/submit?phase=2&amp;url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping-twitter%2F&amp;title=Screen%20Scraping%20Twitter" title="Digg"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/digg.png" title="Digg" alt="Digg" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://twitter.com/home?status=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping-twitter%2F" title="TwitThis"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/twitter.gif" title="TwitThis" alt="TwitThis" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://del.icio.us/post?url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping-twitter%2F&amp;title=Screen%20Scraping%20Twitter" title="del.icio.us"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/delicious.png" title="del.icio.us" alt="del.icio.us" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.netvouz.com/action/submitBookmark?url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping-twitter%2F&amp;title=Screen%20Scraping%20Twitter&amp;popup=no" title="Netvouz"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/netvouz.png" title="Netvouz" alt="Netvouz" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.dzone.com/links/add.html?url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping-twitter%2F&amp;title=Screen%20Scraping%20Twitter" title="description"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/dzone.png" title="description" alt="description" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://reddit.com/submit?url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping-twitter%2F&amp;title=Screen%20Scraping%20Twitter" title="Reddit"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/reddit.png" title="Reddit" alt="Reddit" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.furl.net/storeIt.jsp?u=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping-twitter%2F&amp;t=Screen%20Scraping%20Twitter" title="Furl"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/furl.png" title="Furl" alt="Furl" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.newsvine.com/_tools/seed&amp;save?u=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping-twitter%2F&amp;h=Screen%20Scraping%20Twitter" title="NewsVine"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/newsvine.png" title="NewsVine" alt="NewsVine" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.simpy.com/simpy/LinkAdd.do?href=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping-twitter%2F&amp;title=Screen%20Scraping%20Twitter" title="Simpy"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/simpy.png" title="Simpy" alt="Simpy" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://slashdot.org/bookmark.pl?title=Screen%20Scraping%20Twitter&amp;url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping-twitter%2F" title="Slashdot"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/slashdot.png" title="Slashdot" alt="Slashdot" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.spurl.net/spurl.php?url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping-twitter%2F&amp;title=Screen%20Scraping%20Twitter" title="Spurl"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/spurl.png" title="Spurl" alt="Spurl" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.stumbleupon.com/submit?url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping-twitter%2F&amp;title=Screen%20Scraping%20Twitter" title="StumbleUpon"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/stumbleupon.png" title="StumbleUpon" alt="StumbleUpon" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://myweb2.search.yahoo.com/myresults/bookmarklet?u=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping-twitter%2F&amp;=Screen%20Scraping%20Twitter" title="YahooMyWeb"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/yahoomyweb.png" title="YahooMyWeb" alt="YahooMyWeb" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://tailrank.com/share/?text=&amp;link_href=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping-twitter%2F&amp;title=Screen%20Scraping%20Twitter" title="TailRank"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/tailrank.png" title="TailRank" alt="TailRank" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://technorati.com/faves?add=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping-twitter%2F" title="Technorati"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/technorati.png" title="Technorati" alt="Technorati" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.facebook.com/share.php?u=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping-twitter%2F&amp;t=Screen%20Scraping%20Twitter" title="Facebook"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/facebook.png" title="Facebook" alt="Facebook" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.google.com/bookmarks/mark?op=edit&amp;bkmk=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping-twitter%2F&amp;title=Screen%20Scraping%20Twitter" title="Google"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/googlebookmark.png" title="Google" alt="Google" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.linkedin.com/shareArticle?mini=true&amp;url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping-twitter%2F&amp;title=Screen%20Scraping%20Twitter&amp;source=BRADINO+LAMP+Development+Tutorials%2C+Code%2C+Tips+%26amp%3B+Tricks&amp;summary=I%20got%20an%20email%20today%20asking%20for%20help%20to%20scrape%20Twitter.%20In%20particular%2C%20to%20be%20able%20to%20login.%20So%20I%20am%20going%20to%20show%20everyone%2C%20NOT%20to%20encourage%20anyone%20to%20violate%20Twitters%20terms%20of%20use%20but%20as%20an%20educational%20blog%20post%20about%20how%20PHP%20and%20cURL%20can%20be%20used%20to" title="LinkedIn"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/linkedin.png" title="LinkedIn" alt="LinkedIn" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="https://favorites.live.com/quickadd.aspx?marklet=1&amp;url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping-twitter%2F&amp;title=Screen%20Scraping%20Twitter" title="Live"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/live.png" title="Live" alt="Live" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.myspace.com/Modules/PostTo/Pages/?u=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping-twitter%2F&amp;t=Screen%20Scraping%20Twitter" title="MySpace"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/myspace.png" title="MySpace" alt="MySpace" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://ping.fm/ref/?link=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping-twitter%2F&amp;title=Screen%20Scraping%20Twitter" title="Ping.fm"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/ping.gif" title="Ping.fm" alt="Ping.fm" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://buzz.yahoo.com/submit/?submitUrl=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping-twitter%2F&amp;submitHeadline=Screen%20Scraping%20Twitter&amp;submitSummary=I%20got%20an%20email%20today%20asking%20for%20help%20to%20scrape%20Twitter.%20In%20particular%2C%20to%20be%20able%20to%20login.%20So%20I%20am%20going%20to%20show%20everyone%2C%20NOT%20to%20encourage%20anyone%20to%20violate%20Twitters%20terms%20of%20use%20but%20as%20an%20educational%20blog%20post%20about%20how%20PHP%20and%20cURL%20can%20be%20used%20to&amp;submitCategory=science&amp;submitAssetType=text" title="Yahoo! Buzz"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/yahoobuzz.gif" title="Yahoo! Buzz" alt="Yahoo! Buzz" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="mailto:?subject=Screen%20Scraping%20Twitter&amp;body=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping-twitter%2F" title="E-mail this story to a friend!"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/email_link.png" title="E-mail this story to a friend!" alt="E-mail this story to a friend!" class="sociable-hovers" /></a></li>
</ul>
</div>
]]></content:encoded>
			<wfw:commentRss>http://www.bradino.com/php/screen-scraping-twitter/feed/</wfw:commentRss>
		<slash:comments>6</slash:comments>
		</item>
		<item>
		<title>PHP Screen Scraping</title>
		<link>http://www.bradino.com/php/php-screen-scraping/</link>
		<comments>http://www.bradino.com/php/php-screen-scraping/#comments</comments>
		<pubDate>Mon, 02 Jun 2008 18:20:47 +0000</pubDate>
		<dc:creator>BRADINO</dc:creator>
				<category><![CDATA[PHP]]></category>
		<category><![CDATA[Screen Scraping]]></category>

		<guid isPermaLink="false">http://www.bradino.com/php/php-screen-scraping/</guid>
		<description><![CDATA[I had a request recently for help with scraping a little content from http://www.newyork411.com :)
So here we go. This time I made a quick PHP class with some basic functions to grab the source of the page as well as fetchBetween, fetchAfter, fetchAll, etc. You can get the latest version of the class at http://www.bradino.com/downloads/cScrape.txt [...]]]></description>
			<content:encoded><![CDATA[<p>I had a request recently for help with scraping a little content from http://www.newyork411.com :)</p>
<p>So here we go. This time I made a quick PHP class with some basic functions to grab the source of the page as well as fetchBetween, fetchAfter, fetchAll, etc. You can get the latest version of the class at http://www.bradino.com/downloads/cScrape.txt - be sure to rename it to cScrape.php. If there is an interest I can continue to develop this class as a tool for screen scraping with PHP.</p>
<p>Anyway so here we go scraping all the companies from this page http://www.newyork411.com/Ad_Agencies_Production_Companies/category-cid-50553.htm as well as the details of each company, found by clicking on the company.</p>
<p><strong>Step 1 - Initialize the class and fetch the page:</strong></p>
<div class="syntax_hilite">
<div id="php-8">
<div class="php"><span style="color:#616100;">include</span> <span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#FF0000;">'cScrape.php'</span><span style="color:#006600; font-weight:bold;">&#41;</span>;</p>
<p><span style="color:#0000FF;">$scrape</span> = <span style="color:#000000; font-weight:bold;">new</span> Scrape<span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#006600; font-weight:bold;">&#41;</span>;</p>
<p><span style="color:#0000FF;">$url</span> = <span style="color:#FF0000;">'http://www.newyork411.com/Ad_Agencies_Production_Companies/category-cid-50553.htm'</span>;</p>
<p><span style="color:#0000FF;">$scrape</span>-&gt;<span style="color:#006600;">fetch</span><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#0000FF;">$url</span><span style="color:#006600; font-weight:bold;">&#41;</span>;</p>
<p><span style="color:#0000FF;">$data</span> = <span style="color:#0000FF;">$scrape</span>-&gt;<span style="color:#006600;">removeNewlines</span><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#0000FF;">$scrape</span>-&gt;<span style="color:#006600;">result</span><span style="color:#006600; font-weight:bold;">&#41;</span>;</div>
</div>
</div>
<p></p>
<p><strong>Step 2 - find your anchor and get the chunk of html that contains what you want</strong></p>
<div class="syntax_hilite">
<div id="php-9">
<div class="php"><span style="color:#0000FF;">$data</span> = <span style="color:#0000FF;">$scrape</span>-&gt;<span style="color:#006600;">fetchBetween</span><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#FF0000;">'&lt;table width=&quot;490&quot; border=&quot;0&quot; cellpadding=&quot;3&quot;'</span>,<span style="color:#FF0000;">'&lt;/table&gt;'</span>,<span style="color:#0000FF;">$data</span>,<span style="color:#000000; font-weight:bold;">true</span><span style="color:#006600; font-weight:bold;">&#41;</span>;</p>
<p><span style="color:#0000FF;">$rows</span> = <span style="color:#0000FF;">$scrape</span>-&gt;<span style="color:#006600;">fetchAllBetween</span><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#FF0000;">'&lt;TR'</span>,<span style="color:#FF0000;">'&lt;/tr&gt;'</span>,<span style="color:#0000FF;">$data</span>,<span style="color:#000000; font-weight:bold;">true</span><span style="color:#006600; font-weight:bold;">&#41;</span>;</div>
</div>
</div>
<p></p>
<p><strong>Step 3 - parse out the individual values and print out the first record for demo</strong></p>
<div class="syntax_hilite">
<div id="php-10">
<div class="php"><span style="color:#616100;">foreach</span> <span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#0000FF;">$rows</span> <span style="color:#616100;">as</span> <span style="color:#0000FF;">$id</span> =&gt; <span style="color:#0000FF;">$row</span><span style="color:#006600; font-weight:bold;">&#41;</span><span style="color:#006600; font-weight:bold;">&#123;</span><br />
&nbsp; &nbsp; <br />
&nbsp; &nbsp; <span style="color:#0000FF;">$record</span> = <a href="http://www.bradino.com/php-functions/array/"><span style="color:#000066;">array</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#006600; font-weight:bold;">&#41;</span>;<br />
&nbsp; &nbsp; <br />
&nbsp; &nbsp; <span style="color:#0000FF;">$cells</span> = <span style="color:#0000FF;">$scrape</span>-&gt;<span style="color:#006600;">fetchAllBetween</span><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#FF0000;">'&lt;td'</span>,<span style="color:#FF0000;">'&lt;/td&gt;'</span>,<span style="color:#0000FF;">$row</span>,<span style="color:#000000; font-weight:bold;">true</span><span style="color:#006600; font-weight:bold;">&#41;</span>;<br />
&nbsp; &nbsp; <br />
&nbsp; &nbsp; <span style="color:#0000FF;">$record</span><span style="color:#006600; font-weight:bold;">&#91;</span><span style="color:#FF0000;">'company'</span><span style="color:#006600; font-weight:bold;">&#93;</span> = <a href="http://www.bradino.com/php-functions/strip_tags/"><span style="color:#000066;">strip_tags</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#0000FF;">$cells</span><span style="color:#006600; font-weight:bold;">&#91;</span><span style="color:#CC66CC;">1</span><span style="color:#006600; font-weight:bold;">&#93;</span><span style="color:#006600; font-weight:bold;">&#41;</span>;<br />
&nbsp; &nbsp; <br />
&nbsp; &nbsp; <span style="color:#0000FF;">$url</span> = <span style="color:#FF0000;">'http://www.newyork411.com'</span> . <span style="color:#0000FF;">$scrape</span>-&gt;<span style="color:#006600;">fetchBetween</span><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#FF0000;">'&lt;a href=&quot;'</span>,<span style="color:#FF0000;">'&quot;&gt;'</span>,<span style="color:#0000FF;">$cells</span><span style="color:#006600; font-weight:bold;">&#91;</span><span style="color:#CC66CC;">1</span><span style="color:#006600; font-weight:bold;">&#93;</span>,<span style="color:#000000; font-weight:bold;">false</span><span style="color:#006600; font-weight:bold;">&#41;</span>;<br />
&nbsp; &nbsp; <br />
&nbsp; &nbsp; <span style="color:#0000FF;">$url</span> = <a href="http://www.bradino.com/php-functions/str_replace/"><span style="color:#000066;">str_replace</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#FF0000;">' '</span>,<span style="color:#FF0000;">'%20'</span>,<span style="color:#0000FF;">$url</span><span style="color:#006600; font-weight:bold;">&#41;</span>;<br />
&nbsp; &nbsp; <br />
&nbsp; &nbsp; <span style="color:#0000FF;">$scrape</span>-&gt;<span style="color:#006600;">fetch</span><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#0000FF;">$url</span><span style="color:#006600; font-weight:bold;">&#41;</span>;<br />
&nbsp; &nbsp; <br />
&nbsp; &nbsp; <span style="color:#0000FF;">$data2</span> = <span style="color:#0000FF;">$scrape</span>-&gt;<span style="color:#006600;">removeNewlines</span><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#0000FF;">$scrape</span>-&gt;<span style="color:#006600;">result</span><span style="color:#006600; font-weight:bold;">&#41;</span>;<br />
&nbsp; &nbsp; <br />
&nbsp; &nbsp; <span style="color:#0000FF;">$data2</span> = <span style="color:#0000FF;">$scrape</span>-&gt;<span style="color:#006600;">fetchBetween</span><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#FF0000;">'&lt;div id=&quot;tabText&quot;&gt;'</span>,<span style="color:#FF0000;">'&lt;/div&gt;'</span>,<span style="color:#0000FF;">$data2</span>,<span style="color:#000000; font-weight:bold;">true</span><span style="color:#006600; font-weight:bold;">&#41;</span>;<br />
&nbsp; &nbsp; <br />
&nbsp; &nbsp; <span style="color:#0000FF;">$data2</span> = <span style="color:#0000FF;">$scrape</span>-&gt;<span style="color:#006600;">fetchAfter</span><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#FF0000;">'&lt;/table&gt;'</span>,<span style="color:#0000FF;">$data2</span>,<span style="color:#000000; font-weight:bold;">false</span><span style="color:#006600; font-weight:bold;">&#41;</span>;<br />
&nbsp; &nbsp; <br />
&nbsp; &nbsp; <span style="color:#0000FF;">$details</span> = <a href="http://www.bradino.com/php-functions/explode/"><span style="color:#000066;">explode</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#FF0000;">'&lt;br /&gt;'</span>,<span style="color:#0000FF;">$data2</span><span style="color:#006600; font-weight:bold;">&#41;</span>;<br />
&nbsp; &nbsp; <br />
&nbsp; &nbsp; <span style="color:#0000FF;">$record</span><span style="color:#006600; font-weight:bold;">&#91;</span><span style="color:#FF0000;">'address'</span><span style="color:#006600; font-weight:bold;">&#93;</span> = <span style="color:#0000FF;">$details</span><span style="color:#006600; font-weight:bold;">&#91;</span><span style="color:#CC66CC;">0</span><span style="color:#006600; font-weight:bold;">&#93;</span>;<br />
&nbsp; &nbsp; <br />
&nbsp; &nbsp; <span style="color:#0000FF;">$location</span> = <a href="http://www.bradino.com/php-functions/explode/"><span style="color:#000066;">explode</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#FF0000;">','</span>,<span style="color:#0000FF;">$details</span><span style="color:#006600; font-weight:bold;">&#91;</span><span style="color:#CC66CC;">1</span><span style="color:#006600; font-weight:bold;">&#93;</span><span style="color:#006600; font-weight:bold;">&#41;</span>;<br />
&nbsp; &nbsp; <br />
&nbsp; &nbsp; <span style="color:#0000FF;">$record</span><span style="color:#006600; font-weight:bold;">&#91;</span><span style="color:#FF0000;">'city'</span><span style="color:#006600; font-weight:bold;">&#93;</span> = <a href="http://www.bradino.com/php-functions/trim/"><span style="color:#000066;">trim</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#0000FF;">$location</span><span style="color:#006600; font-weight:bold;">&#91;</span><span style="color:#CC66CC;">0</span><span style="color:#006600; font-weight:bold;">&#93;</span><span style="color:#006600; font-weight:bold;">&#41;</span>;<br />
&nbsp; &nbsp; <br />
&nbsp; &nbsp; <span style="color:#0000FF;">$location</span> = <a href="http://www.bradino.com/php-functions/explode/"><span style="color:#000066;">explode</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#FF0000;">' '</span>,<a href="http://www.bradino.com/php-functions/trim/"><span style="color:#000066;">trim</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#0000FF;">$location</span><span style="color:#006600; font-weight:bold;">&#91;</span><span style="color:#CC66CC;">1</span><span style="color:#006600; font-weight:bold;">&#93;</span><span style="color:#006600; font-weight:bold;">&#41;</span><span style="color:#006600; font-weight:bold;">&#41;</span>;<br />
&nbsp; &nbsp; <br />
&nbsp; &nbsp; <span style="color:#0000FF;">$record</span><span style="color:#006600; font-weight:bold;">&#91;</span><span style="color:#FF0000;">'state'</span><span style="color:#006600; font-weight:bold;">&#93;</span> = <a href="http://www.bradino.com/php-functions/trim/"><span style="color:#000066;">trim</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#0000FF;">$location</span><span style="color:#006600; font-weight:bold;">&#91;</span><span style="color:#CC66CC;">0</span><span style="color:#006600; font-weight:bold;">&#93;</span><span style="color:#006600; font-weight:bold;">&#41;</span>;<br />
&nbsp; &nbsp; <br />
&nbsp; &nbsp; <span style="color:#0000FF;">$record</span><span style="color:#006600; font-weight:bold;">&#91;</span><span style="color:#FF0000;">'zip'</span><span style="color:#006600; font-weight:bold;">&#93;</span> = <a href="http://www.bradino.com/php-functions/trim/"><span style="color:#000066;">trim</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#0000FF;">$location</span><span style="color:#006600; font-weight:bold;">&#91;</span><span style="color:#CC66CC;">1</span><span style="color:#006600; font-weight:bold;">&#93;</span><span style="color:#006600; font-weight:bold;">&#41;</span>;<br />
&nbsp; &nbsp; <br />
&nbsp; &nbsp; <span style="color:#616100;">for</span><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#0000FF;">$i</span>=<span style="color:#CC66CC;">2</span>; <span style="color:#0000FF;">$i</span>&lt;=<span style="color:#CC66CC;">5</span>; <span style="color:#0000FF;">$i</span>++<span style="color:#006600; font-weight:bold;">&#41;</span><span style="color:#006600; font-weight:bold;">&#123;</span><br />
&nbsp; &nbsp; &nbsp; &nbsp; <br />
&nbsp; &nbsp; &nbsp; &nbsp; <span style="color:#0000FF;">$detail</span> = <a href="http://www.bradino.com/php-functions/trim/"><span style="color:#000066;">trim</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#0000FF;">$details</span><span style="color:#006600; font-weight:bold;">&#91;</span><span style="color:#0000FF;">$i</span><span style="color:#006600; font-weight:bold;">&#93;</span><span style="color:#006600; font-weight:bold;">&#41;</span>;<br />
&nbsp; &nbsp; &nbsp; &nbsp; <br />
&nbsp; &nbsp; &nbsp; &nbsp; <span style="color:#616100;">if</span><span style="color:#006600; font-weight:bold;">&#40;</span><a href="http://www.bradino.com/php-functions/substr/"><span style="color:#000066;">substr</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#0000FF;">$detail</span>,<span style="color:#CC66CC;">0</span>,<span style="color:#CC66CC;">6</span><span style="color:#006600; font-weight:bold;">&#41;</span>==<span style="color:#FF0000;">'Phone:'</span><span style="color:#006600; font-weight:bold;">&#41;</span> <span style="color:#0000FF;">$record</span><span style="color:#006600; font-weight:bold;">&#91;</span><span style="color:#FF0000;">'phone'</span><span style="color:#006600; font-weight:bold;">&#93;</span> = <a href="http://www.bradino.com/php-functions/str_replace/"><span style="color:#000066;">str_replace</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#FF0000;">'Phone: '</span>,<span style="color:#FF0000;">''</span>,<span style="color:#0000FF;">$detail</span><span style="color:#006600; font-weight:bold;">&#41;</span>;<br />
&nbsp; &nbsp; &nbsp; &nbsp; <br />
&nbsp; &nbsp; &nbsp; &nbsp; <span style="color:#616100;">else</span> <span style="color:#616100;">if</span><span style="color:#006600; font-weight:bold;">&#40;</span><a href="http://www.bradino.com/php-functions/substr/"><span style="color:#000066;">substr</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#0000FF;">$detail</span>,<span style="color:#CC66CC;">0</span>,<span style="color:#CC66CC;">4</span><span style="color:#006600; font-weight:bold;">&#41;</span>==<span style="color:#FF0000;">'Fax:'</span><span style="color:#006600; font-weight:bold;">&#41;</span> <span style="color:#0000FF;">$record</span><span style="color:#006600; font-weight:bold;">&#91;</span><span style="color:#FF0000;">'fax'</span><span style="color:#006600; font-weight:bold;">&#93;</span> = <a href="http://www.bradino.com/php-functions/str_replace/"><span style="color:#000066;">str_replace</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#FF0000;">'Fax: '</span>,<span style="color:#FF0000;">''</span>,<span style="color:#0000FF;">$detail</span><span style="color:#006600; font-weight:bold;">&#41;</span>;<br />
&nbsp; &nbsp; &nbsp; &nbsp; <br />
&nbsp; &nbsp; &nbsp; &nbsp; <span style="color:#616100;">else</span> <span style="color:#616100;">if</span><span style="color:#006600; font-weight:bold;">&#40;</span><a href="http://www.bradino.com/php-functions/substr/"><span style="color:#000066;">substr</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#0000FF;">$detail</span>,<span style="color:#CC66CC;">0</span>,<span style="color:#CC66CC;">4</span><span style="color:#006600; font-weight:bold;">&#41;</span>==<span style="color:#FF0000;">'Web:'</span><span style="color:#006600; font-weight:bold;">&#41;</span> <span style="color:#0000FF;">$record</span><span style="color:#006600; font-weight:bold;">&#91;</span><span style="color:#FF0000;">'web'</span><span style="color:#006600; font-weight:bold;">&#93;</span> = <a href="http://www.bradino.com/php-functions/strip_tags/"><span style="color:#000066;">strip_tags</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><a href="http://www.bradino.com/php-functions/str_replace/"><span style="color:#000066;">str_replace</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#FF0000;">'Web: '</span>,<span style="color:#FF0000;">''</span>,<span style="color:#0000FF;">$detail</span><span style="color:#006600; font-weight:bold;">&#41;</span><span style="color:#006600; font-weight:bold;">&#41;</span>;<br />
&nbsp; &nbsp; &nbsp; &nbsp; <br />
&nbsp; &nbsp; &nbsp; &nbsp; <span style="color:#616100;">else</span> <span style="color:#616100;">if</span><span style="color:#006600; font-weight:bold;">&#40;</span><a href="http://www.bradino.com/php-functions/substr/"><span style="color:#000066;">substr</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#0000FF;">$detail</span>,<span style="color:#CC66CC;">0</span>,<span style="color:#CC66CC;">6</span><span style="color:#006600; font-weight:bold;">&#41;</span>==<span style="color:#FF0000;">'Email:'</span><span style="color:#006600; font-weight:bold;">&#41;</span> <span style="color:#0000FF;">$record</span><span style="color:#006600; font-weight:bold;">&#91;</span><span style="color:#FF0000;">'email'</span><span style="color:#006600; font-weight:bold;">&#93;</span> = <a href="http://www.bradino.com/php-functions/strip_tags/"><span style="color:#000066;">strip_tags</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><a href="http://www.bradino.com/php-functions/str_replace/"><span style="color:#000066;">str_replace</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#FF0000;">'Email: '</span>,<span style="color:#FF0000;">''</span>,<span style="color:#0000FF;">$detail</span><span style="color:#006600; font-weight:bold;">&#41;</span><span style="color:#006600; font-weight:bold;">&#41;</span>;<br />
&nbsp; &nbsp; &nbsp; &nbsp; <br />
&nbsp; &nbsp; <span style="color:#006600; font-weight:bold;">&#125;</span><br />
&nbsp; &nbsp; <br />
&nbsp; &nbsp; <a href="http://www.bradino.com/php-functions/print_r/"><span style="color:#000066;">print_r</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#0000FF;">$record</span><span style="color:#006600; font-weight:bold;">&#41;</span>;<br />
&nbsp; &nbsp; <br />
&nbsp; &nbsp; <a href="http://www.bradino.com/php-functions/die/"><span style="color:#000066;">die</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#006600; font-weight:bold;">&#41;</span>;<br />
<span style="color:#006600; font-weight:bold;">&#125;</span></div>
</div>
</div>
<p></p>
<div class="awmp_tags"><a href="http://www.bradino.com/search/php%20scrape/" rel="tag">php scrape</a> <a href="http://www.bradino.com/search/screen%20scraping/" rel="tag">screen scraping</a> <a href="http://www.bradino.com/search/php%20screen%20scrape/" rel="tag">php screen scrape</a></div>
<div class="sociable">

<ul>
	<li><a rel="nofollow" target="_blank" href="http://digg.com/submit?phase=2&amp;url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping%2F&amp;title=PHP%20Screen%20Scraping%20" title="Digg"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/digg.png" title="Digg" alt="Digg" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://twitter.com/home?status=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping%2F" title="TwitThis"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/twitter.gif" title="TwitThis" alt="TwitThis" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://del.icio.us/post?url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping%2F&amp;title=PHP%20Screen%20Scraping%20" title="del.icio.us"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/delicious.png" title="del.icio.us" alt="del.icio.us" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.netvouz.com/action/submitBookmark?url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping%2F&amp;title=PHP%20Screen%20Scraping%20&amp;popup=no" title="Netvouz"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/netvouz.png" title="Netvouz" alt="Netvouz" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.dzone.com/links/add.html?url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping%2F&amp;title=PHP%20Screen%20Scraping%20" title="DZone"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/dzone.png" title="DZone" alt="DZone" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://reddit.com/submit?url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping%2F&amp;title=PHP%20Screen%20Scraping%20" title="Reddit"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/reddit.png" title="Reddit" alt="Reddit" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.furl.net/storeIt.jsp?u=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping%2F&amp;t=PHP%20Screen%20Scraping%20" title="Furl"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/furl.png" title="Furl" alt="Furl" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.newsvine.com/_tools/seed&amp;save?u=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping%2F&amp;h=PHP%20Screen%20Scraping%20" title="NewsVine"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/newsvine.png" title="NewsVine" alt="NewsVine" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.simpy.com/simpy/LinkAdd.do?href=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping%2F&amp;title=PHP%20Screen%20Scraping%20" title="Simpy"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/simpy.png" title="Simpy" alt="Simpy" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://slashdot.org/bookmark.pl?title=PHP%20Screen%20Scraping%20&amp;url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping%2F" title="Slashdot"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/slashdot.png" title="Slashdot" alt="Slashdot" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.spurl.net/spurl.php?url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping%2F&amp;title=PHP%20Screen%20Scraping%20" title="Spurl"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/spurl.png" title="Spurl" alt="Spurl" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.stumbleupon.com/submit?url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping%2F&amp;title=PHP%20Screen%20Scraping%20" title="StumbleUpon"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/stumbleupon.png" title="StumbleUpon" alt="StumbleUpon" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://myweb2.search.yahoo.com/myresults/bookmarklet?u=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping%2F&amp;t=PHP%20Screen%20Scraping%20" title="YahooMyWeb"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/yahoomyweb.png" title="YahooMyWeb" alt="YahooMyWeb" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://tailrank.com/share/?text=&amp;link_href=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping%2F&amp;title=PHP%20Screen%20Scraping%20" title="TailRank"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/tailrank.png" title="TailRank" alt="TailRank" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://technorati.com/faves?add=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping%2F" title="Technorati"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/technorati.png" title="Technorati" alt="Technorati" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.facebook.com/share.php?u=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping%2F&amp;t=PHP%20Screen%20Scraping%20" title="Facebook"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/facebook.png" title="Facebook" alt="Facebook" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.google.com/bookmarks/mark?op=edit&amp;bkmk=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping%2F&amp;title=PHP%20Screen%20Scraping%20" title="Google"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/googlebookmark.png" title="Google" alt="Google" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.linkedin.com/shareArticle?mini=true&amp;url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping%2F&amp;title=PHP%20Screen%20Scraping%20&amp;source=BRADINO+LAMP+Development+Tutorials%2C+Code%2C+Tips+%26+Tricks&amp;summary=I%20had%20a%20request%20recently%20for%20help%20with%20scraping%20a%20little%20content%20from%20http%3A%2F%2Fwww.newyork411.com%20%3A%29%0D%0A%0D%0ASo%20here%20we%20go.%20This%20time%20I%20made%20a%20quick%20PHP%20class%20with%20some%20basic%20functions%20to%20grab%20the%20source%20of%20the%20page%20as%20well%20as%20fetchBetween%2C%20fetchAfter%2C%20fetc" title="LinkedIn"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/linkedin.png" title="LinkedIn" alt="LinkedIn" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="https://favorites.live.com/quickadd.aspx?marklet=1&amp;url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping%2F&amp;title=PHP%20Screen%20Scraping%20" title="Live"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/live.png" title="Live" alt="Live" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.myspace.com/Modules/PostTo/Pages/?u=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping%2F&amp;t=PHP%20Screen%20Scraping%20" title="MySpace"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/myspace.png" title="MySpace" alt="MySpace" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://ping.fm/ref/?link=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping%2F&amp;title=PHP%20Screen%20Scraping%20" title="Ping.fm"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/ping.gif" title="Ping.fm" alt="Ping.fm" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://buzz.yahoo.com/submit/?submitUrl=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping%2F&amp;submitHeadline=PHP%20Screen%20Scraping%20&amp;submitSummary=I%20had%20a%20request%20recently%20for%20help%20with%20scraping%20a%20little%20content%20from%20http%3A%2F%2Fwww.newyork411.com%20%3A%29%0D%0A%0D%0ASo%20here%20we%20go.%20This%20time%20I%20made%20a%20quick%20PHP%20class%20with%20some%20basic%20functions%20to%20grab%20the%20source%20of%20the%20page%20as%20well%20as%20fetchBetween%2C%20fetchAfter%2C%20fetc&amp;submitCategory=science&amp;submitAssetType=text" title="Yahoo! Buzz"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/yahoobuzz.gif" title="Yahoo! Buzz" alt="Yahoo! Buzz" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="mailto:?subject=PHP%20Screen%20Scraping%20&amp;body=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fphp-screen-scraping%2F" title="E-mail this story to a friend!"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/email_link.png" title="E-mail this story to a friend!" alt="E-mail this story to a friend!" class="sociable-hovers" /></a></li>
</ul>
</div>
]]></content:encoded>
			<wfw:commentRss>http://www.bradino.com/php/php-screen-scraping/feed/</wfw:commentRss>
		<slash:comments>16</slash:comments>
		</item>
		<item>
		<title>PHP Screen Scraping Tutorial</title>
		<link>http://www.bradino.com/php/screen-scraping/</link>
		<comments>http://www.bradino.com/php/screen-scraping/#comments</comments>
		<pubDate>Tue, 20 Nov 2007 04:21:24 +0000</pubDate>
		<dc:creator>BRADINO</dc:creator>
				<category><![CDATA[PHP]]></category>
		<category><![CDATA[Screen Scraping]]></category>

		<guid isPermaLink="false">http://www.bradino.com/php/screen-scraping/</guid>
		<description><![CDATA[UPDATE: New Screen Scraping Post
Screen Scraping is a great skill that every PHP developer should have experience with. Basically it involves scraping the source code of a web page, getting it into a string, and then parsing out the parts that you want to use. A simple application of screen scraping could be to build [...]]]></description>
			<content:encoded><![CDATA[<p>UPDATE: <a href="http://www.bradino.com/php/php-screen-scraping/">New Screen Scraping Post</a></p>
<p>Screen Scraping is a great skill that every PHP developer should have experience with. Basically it involves scraping the source code of a web page, getting it into a string, and then parsing out the parts that you want to use. A simple application of screen scraping could be to build a database of all the NFL teams complete with player details. </p>
<p>What the heck, let's do it... The first step is to get the page HTML into a PHP variable. This is super easy if the page is publicly accessible via a URL - no login or form post required to access... For more complex scraping you can use cURL to get the html source of the page but the rest of the process would be about the same. Anyway, let's scrape the site.</p>
<div class="syntax_hilite">
<div id="php-15">
<div class="php"><span style="color:#0000FF;">$url</span> = <span style="color:#FF0000;">"http://www.nfl.com/teams/sandiegochargers/roster?team=SD"</span>;<br />
<br />
<span style="color:#0000FF;">$raw</span> = <a href="http://www.bradino.com/php-functions/file_get_contents/"><span style="color:#000066;">file_get_contents</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#0000FF;">$url</span><span style="color:#006600; font-weight:bold;">&#41;</span>;</div>
</div>
</div>
<p></p>
<p>I have found that pattern matching is easiest when the string contains no newlines. Here is how I remove them from the raw HTML before I start parsing out the data I want to scrape.</p>
<div class="syntax_hilite">
<div id="php-16">
<div class="php"><span style="color:#0000FF;">$newlines</span> = <a href="http://www.bradino.com/php-functions/array/"><span style="color:#000066;">array</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#FF0000;">"<span style="color:#000099; font-weight:bold;">\t</span>"</span>,<span style="color:#FF0000;">"<span style="color:#000099; font-weight:bold;">\n</span>"</span>,<span style="color:#FF0000;">"<span style="color:#000099; font-weight:bold;">\r</span>"</span>,<span style="color:#FF0000;">"<span style="color:#000099; font-weight:bold;">\x</span>20<span style="color:#000099; font-weight:bold;">\x</span>20"</span>,<span style="color:#FF0000;">"<span style="color:#000099; font-weight:bold;">\0</span>"</span>,<span style="color:#FF0000;">"<span style="color:#000099; font-weight:bold;">\x</span>0B"</span><span style="color:#006600; font-weight:bold;">&#41;</span>;<br />
<br />
<span style="color:#0000FF;">$content</span> = <a href="http://www.bradino.com/php-functions/str_replace/"><span style="color:#000066;">str_replace</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#0000FF;">$newlines</span>, <span style="color:#FF0000;">""</span>, <a href="http://www.bradino.com/php-functions/html_entity_decode/"><span style="color:#000066;">html_entity_decode</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#0000FF;">$raw</span><span style="color:#006600; font-weight:bold;">&#41;</span><span style="color:#006600; font-weight:bold;">&#41;</span>;</div>
</div>
</div>
<p></p>
<p>So now that you have the source code of the page as a string variable, you need to parse out the results. This is where each scraping application will differ. Depending on the page structure and which elements you want to retrieve, you will have to alter the regular expression matching. You can view the source and see that the roster data you want is in a nice table with class name "standard_table". I also notice that this class name is unique to the page. So the next step is to get the start and end string positions for this table, and then extract just the table from the content:</p>
<div class="syntax_hilite">
<div id="php-17">
<div class="php"><span style="color:#0000FF;">$start</span> = <a href="http://www.bradino.com/php-functions/strpos/"><span style="color:#000066;">strpos</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#0000FF;">$content</span>,<span style="color:#FF0000;">'&lt;table cellpadding=&quot;2&quot; class=&quot;standard_table&quot;'</span><span style="color:#006600; font-weight:bold;">&#41;</span>;<br />
<br />
<span style="color:#0000FF;">$end</span> = <a href="http://www.bradino.com/php-functions/strpos/"><span style="color:#000066;">strpos</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#0000FF;">$content</span>,<span style="color:#FF0000;">'&lt;/table&gt;'</span>,<span style="color:#0000FF;">$start</span><span style="color:#006600; font-weight:bold;">&#41;</span> + <span style="color:#CC66CC;">8</span>;<br />
<br />
<span style="color:#0000FF;">$table</span> = <a href="http://www.bradino.com/php-functions/substr/"><span style="color:#000066;">substr</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#0000FF;">$content</span>,<span style="color:#0000FF;">$start</span>,<span style="color:#0000FF;">$end</span>-<span style="color:#0000FF;">$start</span><span style="color:#006600; font-weight:bold;">&#41;</span>;</div>
</div>
</div>
<p></p>
<p>Now we have just the table containing the roster data, and we need to parse out the rows and cells. The easiest way to do this is with preg_match_all. If this code is not clear, you can print_r and die() in the loop to see what the rows and cells arrays contain.</p>
<div class="syntax_hilite">
<div id="php-18">
<div class="php"><a href="http://www.bradino.com/php-functions/preg_match_all/"><span style="color:#000066;">preg_match_all</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#FF0000;">"|&lt;tr(.*)&lt;/tr&gt;|U"</span>,<span style="color:#0000FF;">$table</span>,<span style="color:#0000FF;">$rows</span><span style="color:#006600; font-weight:bold;">&#41;</span>;</p>
<p><span style="color:#616100;">foreach</span> <span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#0000FF;">$rows</span><span style="color:#006600; font-weight:bold;">&#91;</span><span style="color:#CC66CC;">0</span><span style="color:#006600; font-weight:bold;">&#93;</span> <span style="color:#616100;">as</span> <span style="color:#0000FF;">$row</span><span style="color:#006600; font-weight:bold;">&#41;</span><span style="color:#006600; font-weight:bold;">&#123;</span></p>
<p>&nbsp; &nbsp; <span style="color:#616100;">if</span> <span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#006600; font-weight:bold;">&#40;</span><a href="http://www.bradino.com/php-functions/strpos/"><span style="color:#000066;">strpos</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#0000FF;">$row</span>,<span style="color:#FF0000;">'&lt;th'</span><span style="color:#006600; font-weight:bold;">&#41;</span>===<span style="color:#000000; font-weight:bold;">false</span><span style="color:#006600; font-weight:bold;">&#41;</span><span style="color:#006600; font-weight:bold;">&#41;</span><span style="color:#006600; font-weight:bold;">&#123;</span><br />
&nbsp; &nbsp; <br />
&nbsp; &nbsp; &nbsp; &nbsp; <a href="http://www.bradino.com/php-functions/preg_match_all/"><span style="color:#000066;">preg_match_all</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#FF0000;">"|&lt;td(.*)&lt;/td&gt;|U"</span>,<span style="color:#0000FF;">$row</span>,<span style="color:#0000FF;">$cells</span><span style="color:#006600; font-weight:bold;">&#41;</span>;<br />
&nbsp; &nbsp; &nbsp; &nbsp; <br />
&nbsp; &nbsp; &nbsp; &nbsp; <span style="color:#0000FF;">$number</span> = <a href="http://www.bradino.com/php-functions/strip_tags/"><span style="color:#000066;">strip_tags</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#0000FF;">$cells</span><span style="color:#006600; font-weight:bold;">&#91;</span><span style="color:#CC66CC;">0</span><span style="color:#006600; font-weight:bold;">&#93;</span><span style="color:#006600; font-weight:bold;">&#91;</span><span style="color:#CC66CC;">0</span><span style="color:#006600; font-weight:bold;">&#93;</span><span style="color:#006600; font-weight:bold;">&#41;</span>;<br />
&nbsp; &nbsp; &nbsp; &nbsp; <br />
&nbsp; &nbsp; &nbsp; &nbsp; <span style="color:#0000FF;">$name</span> = <a href="http://www.bradino.com/php-functions/strip_tags/"><span style="color:#000066;">strip_tags</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#0000FF;">$cells</span><span style="color:#006600; font-weight:bold;">&#91;</span><span style="color:#CC66CC;">0</span><span style="color:#006600; font-weight:bold;">&#93;</span><span style="color:#006600; font-weight:bold;">&#91;</span><span style="color:#CC66CC;">1</span><span style="color:#006600; font-weight:bold;">&#93;</span><span style="color:#006600; font-weight:bold;">&#41;</span>;<br />
&nbsp; &nbsp; &nbsp; &nbsp; <br />
&nbsp; &nbsp; &nbsp; &nbsp; <span style="color:#0000FF;">$position</span> = <a href="http://www.bradino.com/php-functions/strip_tags/"><span style="color:#000066;">strip_tags</span></a><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#0000FF;">$cells</span><span style="color:#006600; font-weight:bold;">&#91;</span><span style="color:#CC66CC;">0</span><span style="color:#006600; font-weight:bold;">&#93;</span><span style="color:#006600; font-weight:bold;">&#91;</span><span style="color:#CC66CC;">2</span><span style="color:#006600; font-weight:bold;">&#93;</span><span style="color:#006600; font-weight:bold;">&#41;</span>;<br />
&nbsp; &nbsp; &nbsp; &nbsp; <br />
&nbsp; &nbsp; &nbsp; &nbsp; <a href="http://www.bradino.com/php-functions/echo/"><span style="color:#000066;">echo</span></a> <span style="color:#FF0000;">"{$position} - {$name} - Number {$number} &lt;br&gt;<span style="color:#000099; font-weight:bold;">\n</span>"</span>;<br />
&nbsp; &nbsp; <br />
&nbsp; &nbsp; <span style="color:#006600; font-weight:bold;">&#125;</span></p>
<p><span style="color:#006600; font-weight:bold;">&#125;</span></p></div>
</div>
</div>
<p></p>
<p>So now we have parsed all the data for a given team from the official NFL site. To do all the teams, wrap this in a loop and as a final step, write all the data to a database table and voila, you have a database of all team rosters for the NFL.</p>
<p>This simple scraping example is just to illustrate the basic concept. Also keep in mind that if the source structure of the page you want to scrape changes, you will need to adjust your pattern matching. You should always scrape the page once and save the results in a file, then read that file into your code for development testing to minimize the hits to the live server. My personal opinion is that anything that is publicly accessible via the internet should be able to be scraped. What is the difference from copying and pasting it? Basically, that is what you are doing, just programmatically. You can definitely get into trouble if you misuse data that you scraped; for example, you could probably violate copyrights. Please scrape responsibly :)</p>
<div class="awmp_tags"><a href="http://www.bradino.com/search/scraping/" rel="tag">scraping</a> <a href="http://www.bradino.com/search/screen%20scraping/" rel="tag">screen scraping</a> <a href="http://www.bradino.com/search/web%20scraping/" rel="tag">web scraping</a> <a href="http://www.bradino.com/search/website%20scraping/" rel="tag">website scraping</a> <a href="http://www.bradino.com/search/php%20scrape/" rel="tag">php scrape</a></div>
<div class="sociable">

<ul>
	<li><a rel="nofollow" target="_blank" href="http://digg.com/submit?phase=2&amp;url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping%2F&amp;title=PHP%20Screen%20Scraping%20Tutorial" title="Digg"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/digg.png" title="Digg" alt="Digg" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://twitter.com/home?status=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping%2F" title="TwitThis"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/twitter.gif" title="TwitThis" alt="TwitThis" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://del.icio.us/post?url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping%2F&amp;title=PHP%20Screen%20Scraping%20Tutorial" title="del.icio.us"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/delicious.png" title="del.icio.us" alt="del.icio.us" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.netvouz.com/action/submitBookmark?url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping%2F&amp;title=PHP%20Screen%20Scraping%20Tutorial&amp;popup=no" title="Netvouz"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/netvouz.png" title="Netvouz" alt="Netvouz" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.dzone.com/links/add.html?url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping%2F&amp;title=PHP%20Screen%20Scraping%20Tutorial" title="DZone"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/dzone.png" title="DZone" alt="DZone" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://reddit.com/submit?url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping%2F&amp;title=PHP%20Screen%20Scraping%20Tutorial" title="Reddit"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/reddit.png" title="Reddit" alt="Reddit" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.furl.net/storeIt.jsp?u=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping%2F&amp;t=PHP%20Screen%20Scraping%20Tutorial" title="Furl"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/furl.png" title="Furl" alt="Furl" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.newsvine.com/_tools/seed&amp;save?u=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping%2F&amp;h=PHP%20Screen%20Scraping%20Tutorial" title="NewsVine"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/newsvine.png" title="NewsVine" alt="NewsVine" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.simpy.com/simpy/LinkAdd.do?href=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping%2F&amp;title=PHP%20Screen%20Scraping%20Tutorial" title="Simpy"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/simpy.png" title="Simpy" alt="Simpy" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://slashdot.org/bookmark.pl?title=PHP%20Screen%20Scraping%20Tutorial&amp;url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping%2F" title="Slashdot"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/slashdot.png" title="Slashdot" alt="Slashdot" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.spurl.net/spurl.php?url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping%2F&amp;title=PHP%20Screen%20Scraping%20Tutorial" title="Spurl"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/spurl.png" title="Spurl" alt="Spurl" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.stumbleupon.com/submit?url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping%2F&amp;title=PHP%20Screen%20Scraping%20Tutorial" title="StumbleUpon"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/stumbleupon.png" title="StumbleUpon" alt="StumbleUpon" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://myweb2.search.yahoo.com/myresults/bookmarklet?u=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping%2F&amp;t=PHP%20Screen%20Scraping%20Tutorial" title="YahooMyWeb"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/yahoomyweb.png" title="YahooMyWeb" alt="YahooMyWeb" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://tailrank.com/share/?text=&amp;link_href=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping%2F&amp;title=PHP%20Screen%20Scraping%20Tutorial" title="TailRank"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/tailrank.png" title="TailRank" alt="TailRank" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://technorati.com/faves?add=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping%2F" title="Technorati"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/technorati.png" title="Technorati" alt="Technorati" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.facebook.com/share.php?u=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping%2F&amp;t=PHP%20Screen%20Scraping%20Tutorial" title="Facebook"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/facebook.png" title="Facebook" alt="Facebook" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.google.com/bookmarks/mark?op=edit&amp;bkmk=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping%2F&amp;title=PHP%20Screen%20Scraping%20Tutorial" title="Google"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/googlebookmark.png" title="Google" alt="Google" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.linkedin.com/shareArticle?mini=true&amp;url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping%2F&amp;title=PHP%20Screen%20Scraping%20Tutorial&amp;source=BRADINO+LAMP+Development+Tutorials%2C+Code%2C+Tips+%26+Tricks&amp;summary=UPDATE%3A%20New%20Screen%20Scraping%20Post%0D%0A%0D%0AScreen%20Scraping%20is%20a%20great%20skill%20that%20every%20PHP%20developer%20should%20have%20experience%20with.%20Basically%20it%20involves%20scraping%20the%20source%20code%20of%20a%20web%20page%2C%20getting%20it%20into%20a%20string%2C%20and%20then%20parsing%20out%20the%20parts%20that%20you" title="LinkedIn"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/linkedin.png" title="LinkedIn" alt="LinkedIn" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="https://favorites.live.com/quickadd.aspx?marklet=1&amp;url=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping%2F&amp;title=PHP%20Screen%20Scraping%20Tutorial" title="Live"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/live.png" title="Live" alt="Live" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://www.myspace.com/Modules/PostTo/Pages/?u=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping%2F&amp;t=PHP%20Screen%20Scraping%20Tutorial" title="MySpace"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/myspace.png" title="MySpace" alt="MySpace" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://ping.fm/ref/?link=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping%2F&amp;title=PHP%20Screen%20Scraping%20Tutorial" title="Ping.fm"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/ping.gif" title="Ping.fm" alt="Ping.fm" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="http://buzz.yahoo.com/submit/?submitUrl=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping%2F&amp;submitHeadline=PHP%20Screen%20Scraping%20Tutorial&amp;submitSummary=UPDATE%3A%20New%20Screen%20Scraping%20Post%0D%0A%0D%0AScreen%20Scraping%20is%20a%20great%20skill%20that%20every%20PHP%20developer%20should%20have%20experience%20with.%20Basically%20it%20involves%20scraping%20the%20source%20code%20of%20a%20web%20page%2C%20getting%20it%20into%20a%20string%2C%20and%20then%20parsing%20out%20the%20parts%20that%20you&amp;submitCategory=science&amp;submitAssetType=text" title="Yahoo! Buzz"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/yahoobuzz.gif" title="Yahoo! Buzz" alt="Yahoo! Buzz" class="sociable-hovers" /></a></li>
	<li><a rel="nofollow" target="_blank" href="mailto:?subject=PHP%20Screen%20Scraping%20Tutorial&amp;body=http%3A%2F%2Fwww.bradino.com%2Fphp%2Fscreen-scraping%2F" title="E-mail this story to a friend!"><img src="http://www.bradino.com/wp-content/plugins/sociable/images/email_link.png" title="E-mail this story to a friend!" alt="E-mail this story to a friend!" class="sociable-hovers" /></a></li>
</ul>
</div>
]]></content:encoded>
			<wfw:commentRss>http://www.bradino.com/php/screen-scraping/feed/</wfw:commentRss>
		<slash:comments>22</slash:comments>
		</item>
	</channel>
</rss>
