Results 1 to 5 of 5
  1. #1
    Join Date
    Jan 2003
    Posts
    55

    Unanswered: Data extract!! HELPPP

    Can anyone tell me how I can extract all the details of professors from this page.

    http://155.69.224.75:8000/eeepeople/AcadStaff.asp

    The data is in an HTML table. I need to also parse the links by clicking on the professor names and getting further data from their respective home pages.
    As for now just getting table data is where i need help

    A code snippet would be great to get me started. The data extracted is to be used to populate tables of a sql server dbase which i know how to do.

    Please help
    cheers
    You try and try again..but then give up, there's no sense in being a complete fool about it!!!

  2. #2
    Join Date
    Nov 2003
    Posts
    65

    Re: Data extract!! HELPPP

    I would think you will need to use an HTML parser module for Perl (for example HTML::Parser or HTML::TreeBuilder from CPAN) to strip the data out of the source of that webpage.


    Originally posted by shuchi
    Can anyone tell me how I can extract all the details of professors from this page.

    http://155.69.224.75:8000/eeepeople/AcadStaff.asp

    The data is in an HTML table. I need to also parse the links by clicking on the professor names and getting further data from their respective home pages.
    As for now just getting table data is where i need help

    A code snippet would be great to get me started. The data extracted is to be used to populate tables of a sql server dbase which i know how to do.

    Please help
    cheers

  3. #3
    Join Date
    Jun 2004
    Location
    Nowhere Near You
    Posts
    89
    Code:
    #!/usr/bin/perl
    #
    # Fetch the academic-staff listing page and build %h_HomePages, a hash
    # keyed on the staff member's displayed name whose value is the absolute
    # URL of his/her homepage.

    use strict;
    use warnings;

    use File::Path;
    use HTML::TreeBuilder;
    use LWP::Simple;
    use URI;

    my %h_HomePages;

    my $s_URL  = "http://155.69.224.75:8000/eeepeople/AcadStaff.asp";
    my $s_HTML = LWP::Simple::get($s_URL);
    if (defined $s_HTML) { # We have the data from the url
        my $o_Root = HTML::TreeBuilder->new_from_content($s_HTML);

        # Walk every <table>, then every <a> inside it.  Nested tables make
        # some anchors show up more than once; that is harmless because the
        # hash assignment simply overwrites the same key with the same value.
        for my $o_Table ($o_Root->look_down(_tag => 'table')) {
            for my $o_A ($o_Table->look_down(_tag => 'a')) {
                #print $o_A->as_HTML("<>&","",{}),"\n"; # <= note the arguments
                my $s_Name    = $o_A->as_text();
                my $s_Address = $o_A->attr("href");

                # Named anchors (<a name="...">) have no href at all.
                next unless defined $s_Address;

                # A staff entry has a multi-word name and links to a ".htm"
                # page.  The dot is escaped so that e.g. "xhtm" is rejected.
                next unless $s_Name =~ m{ } and $s_Address =~ m{\.htm\z};

                # Resolve against the listing URL so relative links are
                # fetchable later; absolute links pass through unchanged.
                $h_HomePages{$s_Name}
                    = URI->new_abs($s_Address, $s_URL)->as_string;
            }
        }

        # Explicitly free the parse tree.
        $o_Root->delete;
    }
    else {
        warn "Could not fetch $s_URL\n";
    }
    # Now %h_HomePages is a hash keyed on name value being his/her homepage
    Running the debugger:
    Code:
      DB<7> x %h_HomePages
    0  'Ng-Lim Jit Poh, Jessica'
    1  'http://www.ntu.edu.sg/eee/eee0/cv/ejlim.htm'
    2  'Er Meng Hwa'
    3  'http://www.ntu.edu.sg/eee/eee0/cv/emher.htm'
    4  'Heng Swee Hai, Micheal'
    5  'http://www.ntu.edu.sg/eee/eee0/cv/eshheng.htm'
    6  'Loke Wei Sue'
    7  'http://www.ntu.edu.sg/eee/eee0/cv/ewsloke.htm'
    8  'Kam Chan Hin'
    9  'http://www.ntu.edu.sg/eee/eee6/cv/echkam.htm'
    10  'Ho Woon Yee'
    11  'http://www.ntu.edu.sg/eee/eee0/cv/ewyho.htm'
    12  'Fang Woan Pin'
    13  'http://www.ntu.edu.sg/eee/eee0/cv/ewpfang.htm'
    14  'Lai Phooi Ching'
    15  'http://www.ntu.edu.sg/eee/eee0/cv/epclai.htm'
    16  'McClure, Joanne Wendy'
    17  'http://www.ntu.edu.sg/eee/eee0/cv/ejwmcclure.htm'
    18  'Lee Hwee Hoon'
    19  'http://www.ntu.edu.sg/eee/eee0/cv/ehhlee.htm'
    20  'Chan Ling Ling'
    21  'http://www.ntu.edu.sg/eee/eee0/cv/ellchan.htm'
    22  'Teoh Eam Khwang'
    23  'http://www.ntu.edu.sg/eee/eee4/cv/eekteoh.htm'
    24  'Yoon Soon Fatt'
    25  'http://www.ntu.edu.sg/eee/eee6/cv/esfyoon.htm'
    26  'Tay Beng Kang'
    27  'http://www.ntu.edu.sg/eee/eee6/cv/ebktay.htm'
      DB<8> q
    --- now take it from there, okay?

  4. #4
    Join Date
    Jun 2004
    Location
    Nowhere Near You
    Posts
    89

    Cool

    What the heck! Might as well fetch all those homepages while we're at it!
    Code:
    #!/usr/bin/perl
    #
    # Step 1: fetch the academic-staff listing page and build %h_HomePages
    #         (displayed name => absolute homepage URL).
    # Step 2: fetch and parse every homepage found in step 1.

    use strict;
    use warnings;

    use File::Path;
    use HTML::TreeBuilder;
    use LWP::Simple;
    use URI;

    my $s_URL  = "http://155.69.224.75:8000/eeepeople/AcadStaff.asp";
    my $s_HTML = LWP::Simple::get($s_URL);
    my %h_HomePages;
    if (defined $s_HTML) { # We have the data from the url
        my $o_Root = HTML::TreeBuilder->new_from_content($s_HTML);

        # Walk every <table>, then every <a> inside it.  Anchors inside
        # nested tables may be visited twice; the hash makes that harmless.
        for my $o_Table ($o_Root->look_down(_tag => 'table')) {
            for my $o_A ($o_Table->look_down(_tag => 'a')) {
                my $s_Name    = $o_A->as_text();
                my $s_Address = $o_A->attr("href");

                # Named anchors (<a name="...">) have no href at all.
                next unless defined $s_Address;

                # A staff entry has a multi-word name and links to a ".htm"
                # page.  The dot is escaped so that e.g. "xhtm" is rejected.
                next unless $s_Name =~ m{ } and $s_Address =~ m{\.htm\z};

                # Resolve against the listing URL so relative links can be
                # fetched below; absolute links pass through unchanged.
                $h_HomePages{$s_Name}
                    = URI->new_abs($s_Address, $s_URL)->as_string;
            }
        }

        # Explicitly free the parse tree.
        $o_Root->delete;
    }
    else {
        warn "Could not fetch $s_URL\n";
    }
    # now %h_HomePages is a hash keyed on name value being homepage

    for my $s_Instructor (sort keys %h_HomePages) {
        # Look the URL up in %h_HomePages (plural).  A misspelt hash name
        # here silently yields undef when "use strict" is off, and then no
        # homepage is ever fetched.
        my $s_HomePage = $h_HomePages{$s_Instructor};
        my $s_PageHTML = LWP::Simple::get($s_HomePage);
        if (!defined $s_PageHTML) {
            warn "Could not fetch $s_HomePage for $s_Instructor\n";
            next;
        }

        # We have the data from the url
        my $o_Root = HTML::TreeBuilder->new_from_content($s_PageHTML);
    # Do your HTML processing here:

    #
        $o_Root->delete;
    }

  5. #5
    Join Date
    Jun 2004
    Location
    Nowhere Near You
    Posts
    89

    Cool

    What the heck!
    Code:
    #!/usr/bin/perl
    #
    # Step 1: fetch the academic-staff listing page and build %h_HomePages
    #         (displayed name => absolute homepage URL).
    # Step 2: fetch and parse every homepage found in step 1.

    use strict;
    use warnings;

    use File::Path;
    use HTML::TreeBuilder;
    use LWP::Simple;
    use URI;

    my $s_URL  = "http://155.69.224.75:8000/eeepeople/AcadStaff.asp";
    my $s_HTML = LWP::Simple::get($s_URL);
    my %h_HomePages;
    if (defined $s_HTML) { # We have the data from the url
        my $o_Root = HTML::TreeBuilder->new_from_content($s_HTML);

        # Walk every <table>, then every <a> inside it.  Anchors inside
        # nested tables may be visited twice; the hash makes that harmless.
        for my $o_Table ($o_Root->look_down(_tag => 'table')) {
            for my $o_A ($o_Table->look_down(_tag => 'a')) {
                my $s_Name    = $o_A->as_text();
                my $s_Address = $o_A->attr("href");

                # Named anchors (<a name="...">) have no href at all.
                next unless defined $s_Address;

                # A staff entry has a multi-word name and links to a ".htm"
                # page.  The dot is escaped so that e.g. "xhtm" is rejected.
                next unless $s_Name =~ m{ } and $s_Address =~ m{\.htm\z};

                # Resolve against the listing URL so relative links can be
                # fetched below; absolute links pass through unchanged.
                $h_HomePages{$s_Name}
                    = URI->new_abs($s_Address, $s_URL)->as_string;
            }
        }

        # Explicitly free the parse tree.
        $o_Root->delete;
    }
    else {
        warn "Could not fetch $s_URL\n";
    }
    # now %h_HomePages is a hash keyed on name value being homepage

    for my $s_Instructor (sort keys %h_HomePages) {
        # Look the URL up in %h_HomePages (plural).  A misspelt hash name
        # here silently yields undef when "use strict" is off, and then no
        # homepage is ever fetched.
        my $s_HomePage = $h_HomePages{$s_Instructor};
        my $s_PageHTML = LWP::Simple::get($s_HomePage);
        if (!defined $s_PageHTML) {
            warn "Could not fetch $s_HomePage for $s_Instructor\n";
            next;
        }

        # We have the data from the url
        my $o_Root = HTML::TreeBuilder->new_from_content($s_PageHTML);
    # Do your HTML processing here:

    #
        $o_Root->delete;
    }

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •