Gregory:
Try the following function:
(* Messages issued when the filing cannot be retrieved or no candidate table is found. *)
getBeneficialfromSECDEF14A::nofiling =
  "No plain-text DEF 14A filing could be retrieved for CIK `1`.";
getBeneficialfromSECDEF14A::notable =
  "No table near a mention of \"beneficial\" was found in the filing for CIK `1`.";

(* getBeneficialfromSECDEF14A[cik]
   Queries the SEC EDGAR search page for DEF-type filings of the given CIK,
   imports the first search-result link that ends in .txt (presumably the most
   recent filing; confirm against the search page ordering), strips HTML
   attributes and some inline markup, and returns the table that lies closest
   to the mentions of the word "beneficial", imported as {"HTML", "Data"}.

   cik     : an integer CIK, or a string holding one.
   returns : the nested list produced by ImportString for the selected table,
             or $Failed (after issuing a message) when the filing cannot be
             fetched or no suitable table can be located.

   The choice of table is a heuristic: for every occurrence of "beneficial"
   the nearest opening and closing table tags are collected, and the most
   frequently chosen ones win. *)
getBeneficialfromSECDEF14A[cik_] :=
 Module[{paddedCIK, urlfullpath, searchResults, textOnlylinks,
   formDEF14A, htmltagstartpos, htmltagendpos, htmlDEF14A,
   DEF14ANoAttribs, tablestartpos, tableendpos, benpos,
   tablestartnearest, tableendnearest, bentablestart, bentableend},
  Catch[
   (* NOTE(review): ToExpression evaluates arbitrary code when cik is a
      string; only pass trusted values, or validate the digits first. *)
   paddedCIK = IntegerString[ToExpression[cik], 10, 10];
   urlfullpath =
    "http://www.sec.gov/cgi-bin/srch-edgar?text=CIK%3D" <> paddedCIK <>
     "+TYPE%3DDEF&first=1994&last=" <>
     DateString[DateList[], "Year"];

   (* Collect the links on the search page; Import yields $Failed on error. *)
   searchResults = Import[urlfullpath, "Hyperlinks"];
   textOnlylinks =
    If[ListQ[searchResults],
     Select[searchResults, StringQ[#] && StringMatchQ[#, "*.txt"] &],
     {}];
   If[textOnlylinks === {},
    Message[getBeneficialfromSECDEF14A::nofiling, cik];
    Throw[$Failed, "getBeneficialfromSECDEF14A"]];

   formDEF14A = Import[First[textOnlylinks], "Plaintext"];
   If[! StringQ[formDEF14A],
    Message[getBeneficialfromSECDEF14A::nofiling, cik];
    Throw[$Failed, "getBeneficialfromSECDEF14A"]];

   (* Keep the span from the first <html> to the last </html>; when the tags
      are missing or out of order, work on the whole document instead. *)
   htmltagstartpos =
    StringPosition[formDEF14A, "<html>", IgnoreCase -> True];
   htmltagendpos =
    StringPosition[formDEF14A, "</html>", IgnoreCase -> True];
   htmlDEF14A =
    If[htmltagstartpos === {} || htmltagendpos === {} ||
      htmltagstartpos[[1, 1]] > htmltagendpos[[-1, 2]],
     formDEF14A,
     StringTake[
      formDEF14A, {htmltagstartpos[[1, 1]], htmltagendpos[[-1, 2]]}]];

   (* Drop every attribute from opening tags so that tags can be matched
      literally below. *)
   DEF14ANoAttribs =
    StringReplace[htmlDEF14A,
     RegularExpression["(<\\w+)[^>]*(>)"] -> "$1$2"];

   (* Remove inline formatting tags. Rules are tried in order at each
      position, so a rule for a bare space must not precede the " <" rule;
      the non-breaking-space entity and character are normalised instead.
      IgnoreCase makes separate upper-case rules unnecessary. *)
   DEF14ANoAttribs =
    StringReplace[
     DEF14ANoAttribs, {"<br>" -> " ", "<hr>" -> " ", "&nbsp;" -> " ",
      "\[NonBreakingSpace]" -> " ", "<u>" -> "", "</u>" -> "",
      "<b>" -> "", "</b>" -> "", "<font>" -> " ", "</font>" -> " ",
      "<small>" -> "", "</small>" -> "", "> " -> ">", " <" -> "<"},
     IgnoreCase -> True];
   DEF14ANoAttribs =
    StringReplace[DEF14ANoAttribs, RegularExpression["\\n\\n"] -> ""];
   (* Collapse runs of spaces. ReplaceRepeated would not do this: it matches
      whole expressions, not substrings, so it leaves a string untouched. *)
   DEF14ANoAttribs = StringReplace[DEF14ANoAttribs, " " .. -> " "];

   tablestartpos =
    StringPosition[DEF14ANoAttribs, "<table>", IgnoreCase -> True];
   tableendpos =
    StringPosition[DEF14ANoAttribs, "</table>", IgnoreCase -> True];
   benpos =
    StringPosition[DEF14ANoAttribs, "beneficial", IgnoreCase -> True];
   If[tablestartpos === {} || tableendpos === {} || benpos === {},
    Message[getBeneficialfromSECDEF14A::notable, cik];
    Throw[$Failed, "getBeneficialfromSECDEF14A"]];

   (* For each mention of "beneficial", the nearest opening and closing
      table tags, each given as a {start, end} character range. *)
   tablestartnearest =
    Flatten[Map[Nearest[tablestartpos], benpos], 1];
   tableendnearest = Flatten[Map[Nearest[tableendpos], benpos], 1];

   (* Most frequently selected opening tag; ties go to the earliest one. *)
   bentablestart = Min[Commonest[tablestartnearest]];
   (* Same choice for the closing tag, but take the END of its range so the
      extracted fragment contains the complete closing tag. *)
   bentableend = Last[First[Sort[Commonest[tableendnearest]]]];

   If[bentableend < bentablestart,
    (* The favoured closing tag precedes the table: look for another one
       after the start, first among the nearest candidates and then among
       all closing tags. *)
    bentableend =
     SelectFirst[
      Join[tableendnearest[[All, 2]], tableendpos[[All, 2]]],
      bentablestart < # &, $Failed];
    If[bentableend === $Failed,
     Message[getBeneficialfromSECDEF14A::notable, cik];
     Throw[$Failed, "getBeneficialfromSECDEF14A"]]];

   ImportString[
    StringTake[DEF14ANoAttribs, {bentablestart, bentableend}],
    {"HTML", "Data"}],
   "getBeneficialfromSECDEF14A"]];
Here is a test with HPQ (Hewlett-Packard), whose CIK is 47217:
getBeneficialfromSECDEF14A[47217]
{{{"Name of Beneficial Owner",
"Shares of Common Stock Beneficially Owned",
"Percent of Common Stock Outstanding"}, {"Dodge & Cox (1)",
"171,145,618", 9., "%"}, {"State Street Corporation (2)",
"97,792,253", 5.1, "%"}, {"Marc L. Andreessen (3)", "40,740",
"*"}, {"Shumeet Banerji", "32,694", "*"}, {"Robert R. Bennett",
"4,262", "*"}, {"Rajiv L. Gupta (4)", "71,271",
"*"}, {"Klaus Kleinfeld", "\[LongDash]",
"*"}, {"Raymond J. Lane (5)", "462,618",
"*"}, {"Ann M. Livermore (6)", "318,742",
"*"}, {"Raymond E. Ozzie", "4,262", "*"}, {"Gary M. Reiner (7)",
"82,535", "*"}, {"Patricia F. Russo (8)", "20,888",
"*"}, {"James A. Skinner", "4,262",
"*"}, {"Margaret C. Whitman (9)", "4,419,346",
"*"}, {"Catherine A. Lesjak (10)", "875,905",
"*"}, {"William L. Veghte (11)", "385,953",
"*"}, {"Dion J. Weisler (12)", "12,500",
"*"}, {"Michael G. Nefkens (13)", "461,979",
"*"}, {"All current executive officers and directors as a group \
(24 persons) (14)", "7,838,018", "*"}}, {"*",
"Represents holdings of less than 1%."}}
I also tried this with Microsoft (CIK 789019), and the function appears to extract the beneficial-owner table there as well:
getBeneficialfromSECDEF14A[789019]
Give this a trial run. It may be slow, so you may wish to remove the attribute stripping and the other HTML cleaning. I also took advantage of the built-in Nearest function; the function could perhaps be optimized further by choosing the right options for it.
(Addition)
Here is a different version that skips the HTML cleaning and leaves that work to the built-in import commands:
(* Messages issued when the filing cannot be retrieved or no candidate table is found. *)
getBeneficialfromSECDEF14A::nofiling =
  "No plain-text DEF 14A filing could be retrieved for CIK `1`.";
getBeneficialfromSECDEF14A::notable =
  "No table near a mention of \"beneficial\" was found in the filing for CIK `1`.";

(* getBeneficialfromSECDEF14A[cik]
   Variant that performs no HTML clean-up of its own. It queries the SEC
   EDGAR search page for DEF-type filings of the given CIK, imports the first
   search-result link ending in .txt as plain text, and returns the table
   lying closest to the mentions of "beneficial", imported as
   {"HTML", "Data"}.

   cik     : an integer CIK, or a string holding one.
   returns : the nested list produced by ImportString for the selected table,
             or $Failed (after issuing a message) when the filing cannot be
             fetched or no suitable table can be located.

   Opening tags are matched on the prefix "<table" so that tags carrying
   attributes are found as well. *)
getBeneficialfromSECDEF14A[cik_] :=
 Module[{searchResults, filingLink, formDEF14A, tablestartpos,
   tableendpos, benpos, tablestartnearest, tableendnearest,
   bentablestart, bentableend},
  Catch[
   (* NOTE(review): ToExpression evaluates arbitrary code when cik is a
      string; only pass trusted values, or validate the digits first. *)
   searchResults =
    Import["http://www.sec.gov/cgi-bin/srch-edgar?text=CIK%3D" <>
      IntegerString[ToExpression[cik], 10, 10] <>
      "+TYPE%3DDEF&first=1994&last=" <>
      DateString[DateList[], "Year"], "Hyperlinks"];

   (* First link ending in .txt, or $Failed when the search failed or
      returned no such link. *)
   filingLink =
    If[ListQ[searchResults],
     SelectFirst[searchResults,
      StringQ[#] && StringMatchQ[#, "*.txt"] &, $Failed],
     $Failed];
   If[filingLink === $Failed,
    Message[getBeneficialfromSECDEF14A::nofiling, cik];
    Throw[$Failed, "getBeneficialfromSECDEF14A"]];

   formDEF14A = Import[filingLink, "Plaintext"];
   If[! StringQ[formDEF14A],
    Message[getBeneficialfromSECDEF14A::nofiling, cik];
    Throw[$Failed, "getBeneficialfromSECDEF14A"]];

   tablestartpos =
    StringPosition[formDEF14A, "<table", IgnoreCase -> True];
   tableendpos =
    StringPosition[formDEF14A, "</table>", IgnoreCase -> True];
   benpos =
    StringPosition[formDEF14A, "beneficial", IgnoreCase -> True];
   If[tablestartpos === {} || tableendpos === {} || benpos === {},
    Message[getBeneficialfromSECDEF14A::notable, cik];
    Throw[$Failed, "getBeneficialfromSECDEF14A"]];

   (* For each mention of "beneficial", the nearest opening and closing
      table tags, each given as a {start, end} character range. *)
   tablestartnearest =
    Flatten[Map[Nearest[tablestartpos], benpos], 1];
   tableendnearest = Flatten[Map[Nearest[tableendpos], benpos], 1];

   (* Most frequently selected opening tag; ties go to the earliest one. *)
   bentablestart = Min[Commonest[tablestartnearest]];
   (* Same choice for the closing tag, but take the END of its range so the
      extracted fragment contains the complete closing tag. *)
   bentableend = Last[First[Sort[Commonest[tableendnearest]]]];

   If[bentableend < bentablestart,
    (* The favoured closing tag precedes the table: look for another one
       after the start, first among the nearest candidates and then among
       all closing tags. *)
    bentableend =
     SelectFirst[
      Join[tableendnearest[[All, 2]], tableendpos[[All, 2]]],
      bentablestart < # &, $Failed];
    If[bentableend === $Failed,
     Message[getBeneficialfromSECDEF14A::notable, cik];
     Throw[$Failed, "getBeneficialfromSECDEF14A"]]];

   ImportString[
    StringTake[formDEF14A, {bentablestart, bentableend}],
    {"HTML", "Data"}],
   "getBeneficialfromSECDEF14A"]];
Hans