@@ -34,13 +34,18 @@ type walker struct {
3434
3535 begin int
3636 end int
37+
38+ // ext holds the file extension of the current file.
39+ ext string
3740}
3841
3942func newWalker (f * core.File , raw []byte , offset int ) * walker {
4043 return & walker {
4144 lines : len (f .Lines ) + offset ,
4245 context : string2ByteSlice (f .Content ),
43- z : html .NewTokenizer (bytes .NewReader (raw ))}
46+ z : html .NewTokenizer (bytes .NewReader (raw )),
47+ ext : f .NormedExt ,
48+ }
4449}
4550
4651func (w * walker ) sub (sub string , char rune ) bool {
@@ -137,10 +142,20 @@ func (w *walker) walk() (html.TokenType, html.Token, string) {
137142
138143func (w * walker ) replaceToks (tok html.Token ) {
139144 tags := core .StringInSlice (tok .Data , []string {
140- "img" , "a" , "p" , "script" , "h1" , "h2" , "h3" , "h4" , "h5" , "h6" })
145+ "img" , "a" , "p" , "script" , "h1" , "h2" , "h3" , "h4" , "h5" , "h6" , "span" })
141146 if tags {
147+ names := []string {"href" , "id" , "src" , "alt" }
148+ if w .ext == ".html" {
149+ // We need to handle cases in which inline tags include `class` attributes, which may
150+ // contain substrings that match our actual findings. The challenge is that many of our
151+ // supported formats inject these *after* converting to HTML, so we can't find them in
152+ // the original text.
153+ //
154+ // See testdata/fixtures/patterns/{test2.rst, test3.html} for examples.
155+ names = append (names , "class" )
156+ }
142157 for _ , a := range tok .Attr {
143- if core .StringInSlice (a .Key , [] string { "href" , "id" , "src" , "alt" } ) {
158+ if core .StringInSlice (a .Key , names ) {
144159 if a .Key == "href" {
145160 a .Val , _ = url .QueryUnescape (a .Val )
146161 }
0 commit comments