Web Scraping using a Microcontroller

This program connects a Wiring or Arduino module to the internet through a Lantronix serial-to-ethernet converter (Xport, WiPort, or Micro). The microcontroller first makes a TCP connection to a web server. Once it has a connection, it sends an HTTP request for a web page. When the web page comes back, it looks for the < and > symbols and reads the characters between them. It keeps only the numeric ASCII characters (0 – 9), ignoring everything else, and then converts those digits to an integer.

This program couldn’t parse an entire web page very easily, so it’s best used in conjunction with a web scraper PHP script like this one, which reads the AIRNow site and extracts the Air Quality Index into a single string like this:

< AQI: 54>

This program was written to make an air quality index meter out of an analog voltmeter.

The electrical connections to the microcontroller are as follows:

  • disconnected LED: Arduino digital I/O 6
  • connected LED: Arduino digital I/O 7
  • connecting LED: Arduino digital I/O 8
  • requesting LED: Arduino digital I/O 9
  • Lantronix module reset: Arduino digital I/O 10
  • Voltmeter: Arduino digital I/O 11. The voltmeter is controlled by using pulse width modulation (analogWrite() command on the Arduino).

The web scraper is written in PHP. Its code follows below the Arduino code.

Technorati Tags: ,


Arduino code:

/*
    Web Scraper using Lantronix
    Microcontroller is connected to a Lantronix Xport
    serial-to-ethernet device. This program connects
    to a HTTP server through the Xport, makes a HTTP GET
    request for a PHP script, and parses the returned string.

    Xport communicates at 9600-8-n-1 non-inverted (true) serial.

    By Tom Igoe, 13 April 2006
*/



// Defines for the Xport's status (the values held by the status variable).
// loop() acts on whichever state is current:
#define disconnected 0
#define connected 1
#define connecting 2
#define requesting 3
#define reading 4

/*
  Note: Status LEDs correspond to status states, as follows:
 disconnected LED:  Arduino digital I/O 6
 connected LED:  Arduino digital I/O 7
 connecting LED:  Arduino digital I/O 8
 requesting LED:  Arduino digital I/O 9

 loop() lights pin (status + 6), so the reading state (4) has no
 LED of its own and all four LEDs are off while reading.
 */


// Defines for I/O pins:
#define xportResetPin 10
#define meterPin 11        // I/O pin that the voltmeter is on (driven with analogWrite())

// defines for voltmeter:
#define meterMax 165       // analogWrite() value that setMeter() uses for a full-scale reading
#define meterScale 150      // my meter reads 0 - 150

// Define for clock tick interval, in ms
// (not referenced anywhere else in this sketch):
#define interval 20

// variables:
int inByte= -1;          // incoming byte from serial RX
char inString[32];       // digits collected from the incoming serial data (not null-terminated)
int stringPos = 0;       // number of digits currently stored in inString
int kilobytes = 0;       // only printed by printResults(); never updated in this sketch

int status = 0;          // Xport's connection status (one of the status defines above)
int secs = 0;            // second counter (used to sleep between checks)

/*
  One-time initialization: configures the output pins, opens the
  serial port to the Xport, blinks the reset LED and resets the Xport.
 */
void setup() {
  // pins 6 through 9 are the status LEDs and pin 10 is the
  // Xport reset line; all of them are outputs:
  int pin = 6;
  while (pin < 11) {
    pinMode(pin, OUTPUT);
    pin++;
  }

  // pin 13 drives the reset LED that blink() flashes:
  pinMode(13, OUTPUT);

  // the Xport talks at 9600 8-N-1:
  Serial.begin(9600);

  // signal the restart on the reset LED, then reset the Xport:
  blink(3);
  resetXport();
}

/*
  Main loop. Runs a small state machine on the status variable:

    disconnected -> send the connect command to the Xport
    connecting   -> wait for a 'C' byte from the Xport
    connected    -> send the HTTP GET request
    requesting   -> wait for a '<' byte from the server
    reading      -> collect digits until '>', then set the meter,
                    wait 60 seconds, reset the Xport and start over

  Each pass also updates the status LEDs on pins 6 - 9.
 */
void loop() {
  int i = 0;               // generic loop counter

  // set the status lights: the LED on pin (status + 6) is lit,
  // the others are off. The reading state has no LED.
  for (i = 6; i < 10; i++) {
    if (status == i - 6) {
      digitalWrite(i, HIGH);
    }
    else {
      digitalWrite(i, LOW);
    }
  }

  // if you're connected to the server,  make a HTTP call.
  // If not, connect to the server:

  if (status == disconnected) {
    // attempt to connect to the server:
    xportConnect();
  }

  if (status == connecting) {
    // read the serial port:
    if (Serial.available()) {
      inByte = Serial.read();
      // Serial.print(inByte, DEC);
      if (inByte == 67) {  // 'C' in ascii
        status = connected;
      }
    }
  }

  if (status == connected) {
    // send HTTP GET request for CGI script:
    httpRequest();
  }

  if (status == requesting) {
    // wait for bytes from server:
    // read the serial port:
    if (Serial.available()) {
      inByte = Serial.read();
      // If you get a "<" (ASCII 60), what follows is the air quality index:
      if (inByte == 60) {
        stringPos = 0;
        status = reading;
      }
    }
  }

  if (status == reading) {
    if (Serial.available()) {
      inByte = Serial.read();
      // Keep reading until you get a ">" (ASCII 62):
      if (inByte != 62) {
        // save only ASCII numeric characters ('0' - '9'), and only
        // while there is room left in inString, so that a long or
        // malformed reply can't write past the end of the buffer:
        if ((inByte >= 48) && (inByte <= 57)
            && (stringPos < static_cast<int>(sizeof(inString)))) {
          inString[stringPos] = inByte;
          stringPos++;
        }
      }
      else {
        // convert the string to a numeric value:
        int airQuality = stringToNumber(inString, stringPos);
        // set the meter appropriately:
        setMeter(airQuality);
        status = disconnected;

        // wait 60 seconds before trying again:
        for (secs = 0; secs < 60; secs++) {
          delay(1000);
        }

        // reset Xport before next request:
        resetXport();
      }
    }
  }
}

/*
  Sends the connect command to the Xport and moves to the
  connecting state, in which loop() waits for a "C" byte to come back.
 */
void xportConnect() {
  // "C", then the server's numerical address, as in the string below.
  // Fill in your own server's address here:
  const char connectCommand[] = "C192.168.1.23/80\n";

  Serial.print(connectCommand);
  status = connecting;
}

/*
  Sends the HTTP GET request for the scraper script through the
  Xport, clears the receive state, and moves to the requesting
  state, in which loop() waits for the "<" that starts the reply.
 */
void httpRequest() {
  // forget anything left over from the previous reply:
  inByte = -1;
  stringPos = 0;

  //  Make HTTP GET request. Fill in the path to your version
  //  of the CGI script. HTTP lines end in CR LF, so send "\r\n"
  //  rather than a bare "\n":
  Serial.print("GET /~username/scraper.php HTTP/1.1\r\n");
  delay(250);

  //  Fill in your server's name. The empty line after the
  //  header marks the end of the request:
  Serial.print("HOST: www.myserver.com\r\n\r\n");
  status = requesting;
}

/*
  Moves the voltmeter needle to show desiredValue.

  desiredValue is in meter units (0 - meterScale). It is rescaled
  to the 0 - meterMax range and written to meterPin with analogWrite().
  A value above meterScale would peg the meter, so it is ignored
  and the meter keeps its previous reading.
 */
void setMeter(int desiredValue) {
  // off the top of the scale: leave the meter alone
  if (desiredValue > meterScale) {
    return;
  }

  // convert from the meter's scale to the PWM range and send it out:
  analogWrite(meterPin, desiredValue * meterMax / meterScale);
}

/*
  Debugging aid only: prints how many characters are stored in
  inString and, if there are any, the kilobytes variable and the
  stored characters themselves, followed by three newlines.
 */
void printResults() {
  Serial.print(" I got ");
  Serial.print(stringPos, DEC);

  if (stringPos > 0) {
    Serial.print(" bytes, total:  ");
    Serial.print(kilobytes, DEC);
    Serial.print(" string: " );

    // send the stored characters out one at a time, as raw bytes:
    int index = 0;
    while (index < stringPos) {
      Serial.print(inString[index], BYTE);
      index++;
    }
  }

  Serial.print("\n\n\n");
}

/*
  Resets the Xport by pulsing its reset pin low, then waits
  for the Xport to boot before returning.
 */
void resetXport() {
  const int resetPulseMs = 50;     // how long the reset pin is held low
  const int bootDelayMs = 2000;    // pause to let the Xport boot up

  digitalWrite(xportResetPin, LOW);
  delay(resetPulseMs);
  digitalWrite(xportResetPin, HIGH);

  delay(bootDelayMs);
}

/*
  Blinks the reset LED on pin 13 howManyTimes times.
  Each blink is 200 ms on followed by 200 ms off.
 */
void blink(int howManyTimes) {
  for (int remaining = howManyTimes; remaining > 0; remaining--) {
    digitalWrite(13, HIGH);
    delay(200);
    digitalWrite(13, LOW);
    delay(200);
  }
}



/*
  This method converts a string of ASCII digits to a decimal number.

  thisString: the characters to convert; it does not need to be
              null-terminated, because only the first length
              characters are read.
  length:     how many characters to convert. A length of zero
              (or less) gives 0.
  Returns the decimal value of the digits, read left to right.

  There's no error checking in it, so it can return mistakes
  if you give it non-numeric ASCII, or more digits than fit in a long.
  When I wrote this program, the standard C lib was not part of
  Arduino, so I couldn't use atoi().
 */
long stringToNumber(char thisString[], int length) {
  long value = 0;

  // Work from the most significant digit down: each new digit
  // shifts the running total one decimal place to the left.
  // This takes a single pass and needs no power-of-ten helper.
  for (int position = 0; position < length; position++) {
    // subtracting 48 (ASCII '0') turns '0' - '9' into 0 - 9:
    const char digit = thisString[position] - 48;
    value = (value * 10) + digit;
  }
  return value;
}

/*
  Returns digit multiplied by ten raised to power.

  digit: a number between 0 and 9.
  power: the exponent of ten. For zero (or anything below one)
         the multiplier stays at 1, so digit is returned as is.
 */
long powerOfTen(char digit, int power) {
  long multiplier = 1;

  // multiply by ten once for every step of the exponent:
  int remaining = power;
  while (remaining >= 1) {
    multiplier *= 10;
    remaining--;
  }

  return digit * multiplier;
}

PHP code. Note that this scraper is specific to the site mentioned here. You would need to modify it to scrape a different site. Learn the methods from this one, don’t copy the code.

<?php
	/*
		Scrape AQI Page

		This program reads the Air Quality Index page at the address
		in $url and looks for the lines that follow each
		"AQI observed at" line. The first such line is taken as the
		particulate air quality value, the second as the ozone level.

		Only the particulate value is printed, in the following format:
		< AQI: particulate >
		The ozone value is read but its echo is commented out.

		If the page cannot be opened, nothing is printed.
	*/

	// url of the page with the air quality index data for New York City:
	$url = 'http://airnow.gov/index.cfm?action=airnow.showlocal&cityid=164';

	$readParticles = 0;			// flag telling you the next line is the particle value
	$readOzone = 0;				// flag telling you the next line is the ozone value
	$particles = -1;			// the particles value (-1 means "not read yet")
	$ozone = -1;				// the ozone value (-1 means "not read yet")

	// open the file at the URL for reading:
	$filePath = fopen($url, "r");

	// if the page could not be opened there is nothing to read;
	// stop here rather than calling feof() on an invalid handle:
	if ($filePath === false) {
		exit;
	}

	// as long as you haven't reached the end of the file
	while (!feof($filePath))
	{
		// read one line at a time:
		$rawLine = fgets($filePath, 4096);

		// fgets() returns false on a read error or when there is
		// no more data, so stop reading:
		if ($rawLine === false) {
			break;
		}

		// strip all HTML and PHP tags from the line. (fgetss() used to
		// do the read and the strip in one call, but it was removed in PHP 8.)
		// NOTE(review): strip_tags() works on one line at a time, so a
		// tag that spans several lines may be handled differently — confirm
		// against the live page.
		$line = strip_tags($rawLine);

		// if the previous line was the "observed at line" preceding
		// the particle matter reading, then $readParticles = 1 and
		// you should get this line, trim the surrounding whitespace,
		// and save the result in $particles:
		if ($readParticles == 1) {
			$particles = trim($line);
			echo "< AQI: $particles>";
			$readParticles = 0;
		}

		// if the previous line was the "observed at line" preceding
		// the ozone reading, then $readOzone = 1 and
		// you should get this line, trim the surrounding whitespace,
		// and save the result in $ozone:
		if ($readOzone == 1) {
			$ozone = trim($line);
			//echo "<$ozone>";
			$readOzone = 0;
		}

		// if the current line contains the substring "AQI observed at"
		// then the line following it is either the particle reading
		// or the ozone reading:
		if (preg_match('/AQI observed at /', $line)) {
			// if $particles == -1, you haven't gotten
			// a value for it yet, so the next line
			// will be the particle value:
			if ($particles == -1) {
				$readParticles = 1;
			}

			// if $particles > -1, you've gotten a value for it.
			// that means that the next line will be the
			// ozone reading, if you haven't already gotten
			// a reading for ozone (i.e. if $ozone == -1):
			if (($particles > -1) && ($ozone == -1)) {
				$readOzone = 1;
			}
		}
	}

	// close the file at the URL, you're done:
	fclose($filePath);
?>