[http://stackoverflow.com/questions/27106552][]

Using rex may make this type of task a little simpler.

x <- c("a=1", "b=3", "a=9", "c=2", "b=4", "a=2")

First extract the names and values from the strings.

library(rex)
# Split each "key=value" string into a `name` and a `value` column.
# `digits` (one or more) is used instead of `digit` so that multi-digit values
# such as "a=10" are captured whole rather than cut off after the first digit.
matches <- re_matches(x,
  rex(
    capture(name = "name", letter),
    "=",
    capture(name = "value", digits)
    ))
matches

#>   name value
#> 1    a     1
#> 2    b     3
#> 3    a     9
#> 4    c     2
#> 5    b     4
#> 6    a     2

Then group the values by name using split().

groups <- split(as.numeric(matches$value), matches$name)
groups

#> $a
#> [1] 1 9 2
#> 
#> $b
#> [1] 3 4
#> 
#> $c
#> [1] 2

If we convert the result of split() directly to a data.frame, the shorter groups are recycled (or, when their lengths are not compatible, an error is raised) instead of being padded with NA, so we explicitly pad them with NA first.

# Size of the largest group; every group is padded up to this length.
# lengths() always returns an integer vector, unlike sapply(groups, length).
largest_group <- max(lengths(groups))
largest_group

#> [1] 3


# Assigning a larger length extends a vector with NA; a group that already
# has `largest_group` elements is left unchanged.
groups <- lapply(groups, function(group) {
  length(group) <- largest_group
  group
})
groups

#> $a
#> [1] 1 9 2
#> 
#> $b
#> [1]  3  4 NA
#> 
#> $c
#> [1]  2 NA NA

Finally we can create the data.frame

do.call("data.frame", groups)

#>   a  b  c
#> 1 1  3  2
#> 2 9  4 NA
#> 3 2 NA NA

[http://stackoverflow.com/questions/14146362/][]

Using rex may make this type of task a little simpler.

mystrings <- c("X2/D2/F4",
               "X10/D9/F4",
               "X3/D22/F4",
               "X9/D22/F9")

library(rex)
matches <- re_matches(mystrings,
  rex(
    "/",
    any,
    capture(name = "numbers", digits)
    )
  )
as.numeric(matches$numbers)

#> [1]  2  9 22 22

[http://stackoverflow.com/questions/8613237/][]

Using rex may make this type of task a little simpler.

j <- "What kind of cheese isn't your cheese? (wonder) Nacho cheese! (groan) (Laugh)"

library(rex)
matches <- re_matches(j,
  rex(
    "(",
    capture(name = "text", except_any_of(")")),
    ")"),
  global = TRUE)
matches

#> [[1]]
#>     text
#> 1 wonder
#> 2  groan
#> 3  Laugh

[http://stackoverflow.com/questions/22976472][]

Using rex may make this type of task a little simpler.

txt <- as.character("this is just a test! i'm not sure if this is O.K. or if it will work? who knows. regex is sorta new to me..  There are certain cases that I may not figure out??  sad!  ^_^")

# Match one sentence at a time: its first alphanumeric character, followed by
# the remainder of the sentence.
re <- rex(
  # First character of the sentence; upper-cased by the replacement below.
  capture(name = "first_letter", alnum),
  capture(name = "sentence",
    any_non_puncts,
    zero_or_more(
      group(
        # Punctuation that is not followed by whitespace (e.g. the first dot
        # in "O.K.") does not end the sentence.
        punct %if_next_isnt% space,
        any_non_puncts
        )
      ),
    # Optional closing punctuation mark.
    maybe(punct)
    )
  )

# \\U ... \\E upper-cases the first capture; the second capture is put back
# unchanged.
re_substitutes(txt, re, "\\U\\1\\E\\2", global = TRUE)

#> [1] "This is just a test! I'm not sure if this is O.K. Or if it will work? Who knows. Regex is sorta new to me..  There are certain cases that I may not figure out??  Sad!  ^_^"

[http://stackoverflow.com/questions/27172007][]

Using rex may make this type of task a little simpler.

x <- data.frame(
  locationid = c(
    1073744023,
    1073744022,
    1073744025,
    1073744024,
    1073744021,
    1073744026
    ),
  address = c(
    "525 East 68th Street, New York, NY      10065, USA",
    "270 Park Avenue, New York, NY 10017, USA",
    "Rockefeller Center, 50 Rockefeller Plaza, New York, NY 10020, USA",
    "1251 Avenue of the Americas, New York, NY 10020, USA",
    "1301 Avenue of the Americas, New York, NY 10019, USA",
    "44 West 45th Street, New York, NY 10036, USA"
    ))

library(rex)

sep <- rex(",", spaces)

re <-
  rex(
    capture(name = "address",
      except_some_of(",")
    ),
    sep,
    capture(name = "city",
      except_some_of(",")
    ),
    sep,
    capture(name = "state",
      uppers
    ),
    spaces,
    capture(name = "zip",
      some_of(digit, "-")
    ),
    sep,
    capture(name = "country",
      something
    ))

re_matches(x$address, re)

#>                       address     city state   zip country
#> 1        525 East 68th Street New York    NY 10065     USA
#> 2             270 Park Avenue New York    NY 10017     USA
#> 3        50 Rockefeller Plaza New York    NY 10020     USA
#> 4 1251 Avenue of the Americas New York    NY 10020     USA
#> 5 1301 Avenue of the Americas New York    NY 10019     USA
#> 6         44 West 45th Street New York    NY 10036     USA

[http://stackoverflow.com/questions/27155297/][]

Using rex may make this type of task a little simpler.

library(rex)
x <- c(
"https://support.google.com/blogger/topic/12457
https://support.google.com/blogger/topic/12457.
https://support.google.com/blogger/topic/12457] 

https://support.google.com/blogger/topic/12457,
https://support.google.com/blogger/topic/12457),
xxxxxxhttps://support.google.com/blogger/topic/12457),hhhththta")

re <- rex(
  capture(name = "url",
    "https://support.google.com/blogger/topic/",
    digits
    ))

re_matches(x, re, global = TRUE)[[1]]

#>                                              url
#> 1 https://support.google.com/blogger/topic/12457
#> 2 https://support.google.com/blogger/topic/12457
#> 3 https://support.google.com/blogger/topic/12457
#> 4 https://support.google.com/blogger/topic/12457
#> 5 https://support.google.com/blogger/topic/12457
#> 6 https://support.google.com/blogger/topic/12457

[http://stackoverflow.com/questions/27219421][]

Using rex may make this type of task a little simpler.

tmp <- c("Little Street","A323", "Essex Road (A43)", "M43","Orange street","M4","B2045","New Street")

library(rex)
#' Classify road names by their road-number prefix
#'
#' Searches each name for an upper-case letter immediately followed by a digit
#' (e.g. the "A" in "A323" or in "Essex Road (A43)") and uses that letter as
#' the road type. Names without such a pattern are labelled "Minor".
#'
#' @param x Character vector of road names.
#' @return Character vector of labels such as "A Road" or "Minor Road".
classify_road <- function(x) {
  road_number <- rex(
    capture(name = "type", upper),
    digit
  )
  road_type <- re_matches(x, road_number)$type

  # Names with no match come back as NA and become "Minor".
  paste(ifelse(is.na(road_type), "Minor", road_type), "Road")
}

classify_road(tmp)

#> [1] "Minor Road" "A Road"     "A Road"     "M Road"     "Minor Road"
#> [6] "M Road"     "B Road"     "Minor Road"

[http://stackoverflow.com/questions/22247410][]

Using rex may make this type of task a little simpler.

x <- "this is a multiline text 
          some more test here 
          before we get to the good stuff 
          \\end{figure}"

re <- rex("\\end{figure}")
re_matches(x, re)

#> [1] TRUE


regexpr(re, x, perl = TRUE)

#> [1] 111
#> attr(,"match.length")
#> [1] 12
#> attr(,"useBytes")
#> [1] TRUE

[http://stackoverflow.com/questions/23447261][]

Using rex may make this type of task a little simpler.

x = structure(list(text = structure(c(4L, 6L, 1L, 2L, 5L, 3L), .Label =     c("ãããæããããéãããæãããInappropriate announce:-(", 
"@AirAsia your direct debit (Maybank) payment gateways is not working. Is it something     you are working to fix?", 
"@AirAsia Apart from the slight delay and shortage of food on our way back from Phuket, both flights were very smooth. Kudos :)", 
"RT @AirAsia: ØØÙØÙÙÙÙ ÙØØØ ØØØÙ ÙØØØØÙ ØØØØÙÙÙí í Now you can enjoy a #great :D breakfast onboard with our new breakfast meals! :D", 
"xdek ke flight @AirAsia Malaysia to LA... hahah..:p bagi la promo murah2 sikit, kompom aku beli...", 
"You know there is a problem when customer service asks you to wait for 103 minutes and your no is 42 in the queue. X-("
), class = "factor"), created = structure(c(5L, 4L, 4L, 3L, 2L, 
1L), .Label = c("1/2/2014 16:14", "1/2/2014 17:00", "3/2/2014 0:54", 
"3/2/2014 0:58", "3/2/2014 1:28"), class = "factor")), .Names = c("text", 
"created"), class = "data.frame", row.names = c(NA, -6L))

emots <- as.character(outer(c(":", ";", ":-", ";-"), c(")", "(", "]", "[", "D", "o", "O", "P", "p"), paste0))

# Find every emoticon in each text. `global = TRUE` returns all matches per
# element (one data.frame per input string) rather than only the first match.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
re_matches(x$text,
  rex(
    capture(name = "emoticons",
      or(emots)
    )
  ),
  global = TRUE)

#> [[1]]
#>   emoticons
#> 1        :D
#> 2        :D
#> 
#> [[2]]
#>   emoticons
#> 1      <NA>
#> 
#> [[3]]
#>   emoticons
#> 1       :-(
#> 
#> [[4]]
#>   emoticons
#> 1      <NA>
#> 
#> [[5]]
#>   emoticons
#> 1        :p
#> 
#> [[6]]
#>   emoticons
#> 1        :)

[http://stackoverflow.com/questions/27234040][]

Using rex may make this type of task a little simpler.

z <- "<TABLE ALIGN=\"RIGHT\" BORDER CELLSPACING=\"0\" CELLPADDING=\"0\">
   <CAPTION><B>MESA HIGH VICTORIES</B></CAPTION>
   <TH>Team</TH>
   <TH>Score</TH>
   <TR ALIGN=\"CENTER\">
   <TD><B>Parkfield High Demons</B></TD>
   <TD><B>28 to 21</B></TD>
   </TR>
   <TR ALIGN=\"CENTER\">
   <TD><B>Burns High Badgers</B></TD>
   <TD><B>14 to 13</B></TD>
   </TR>
   </TABLE>"

re_matches(z,
  rex(
    capture(name="table",
      "<TABLE", zero_or_more(any, type = "lazy"), "<TR"
    )
  ), options="single-line")

#>                                                                                                                                                         table
#> 1 <TABLE ALIGN="RIGHT" BORDER CELLSPACING="0" CELLPADDING="0">\n   <CAPTION><B>MESA HIGH VICTORIES</B></CAPTION>\n   <TH>Team</TH>\n   <TH>Score</TH>\n   <TR

[http://stackoverflow.com/questions/27236435][]

Using rex may make this type of task a little simpler.

x <- "John a11|a12|\n  Ana a21|a22|\n  Jake a31|a23|\n   "

re_matches(x,
  rex(
      any_spaces,
      capture(name = "text",
        except_some_of("|")
      ),
      any_spaces),
  global = TRUE)[[1]]

#>       text
#> 1 John a11
#> 2      a12
#> 3  Ana a21
#> 4      a22
#> 5 Jake a31
#> 6      a23
#> 7

[http://stackoverflow.com/questions/25764839][]

Using rex may make this type of task a little simpler.

x <- "MSGSRRKATPASRTRVGNYEMGRTLGEGSFAKVKYAKNTVTGDQAAIKILDREKVFRHKMVEQLKREISTMKLIKHPNVVEIIEVMASKTKIYIVLELVNGGELFDKIAQQGRLKEDEARRYFQQLINAVDYCHSRGVYHRDLKPENLILDANGVLKVSDFGLSAFSRQVREDGLLHTACGTPNYVAPEVLSDKGYDGAAADVWSCGVILFVLMAGYLPFDEPNLMTLYKRICKAEFSCPPWFSQGAKRVIKRILEPNPITRISIAELLEDEWFKKGYKPPSFDQDDEDITIDDVDAAFSNSKECLVTEKKEKPVSMNAFELISSSSEFSLENLFEKQAQLVKKETRFTSQRSASEIMSKMEETAKPLGFNVRKDNYKIKMKGDKSGRKGQLSVATEVFEVAPSLHVVELRKTGGDTLEFHKVCDSFYKNFSSGLKDVVWNTDAAAEEQKQ"
re_matches(x,
  rex(
    capture(name = "amino_acids",
      n(any, 6),
      "K",
      n(any, 6)
      )
    ),
  global = TRUE)[[1]]

#>      amino_acids
#> 1  MSGSRRKATPASR
#> 2  GEGSFAKVKYAKN
#> 3  GDQAAIKILDREK
#> 4  KMVEQLKREISTM
#> 5  IEVMASKTKIYIV
#> 6  GGELFDKIAQQGR
#> 7  VYHRDLKPENLIL
#> 8  DANGVLKVSDFGL
#> 9  PEVLSDKGYDGAA
#> 10 NLMTLYKRICKAE
#> 11 WFSQGAKRVIKRI
#> 12 LEDEWFKKGYKPP
#> 13 AAFSNSKECLVTE
#> 14 LENLFEKQAQLVK
#> 15 ASEIMSKMEETAK
#> 16 LGFNVRKDNYKIK
#> 17 GDKSGRKGQLSVA
#> 18 HVVELRKTGGDTL
#> 19 VCDSFYKNFSSGL


# Locate every "K" that has at least 6 characters on each side. The
# %if_prev_is% / %if_next_is% conditions only look around the "K" without
# consuming characters, so overlapping windows are found too (43 here versus
# 19 with the consuming pattern above).
locs <- re_matches(x,
  rex(
    "K" %if_prev_is% n(any, 6) %if_next_is% n(any, 6)
    ),
  global = TRUE, locations = TRUE)[[1]]

# Widen each single-character match by 6 positions on both sides.
substring(x, locs$start - 6, locs$end + 6)

#>  [1] "MSGSRRKATPASR" "GEGSFAKVKYAKN" "GSFAKVKYAKNTV" "AKVKYAKNTVTGD"
#>  [5] "GDQAAIKILDREK" "KILDREKVFRHKM" "EKVFRHKMVEQLK" "KMVEQLKREISTM"
#>  [9] "REISTMKLIKHPN" "STMKLIKHPNVVE" "IEVMASKTKIYIV" "VMASKTKIYIVLE"
#> [13] "GGELFDKIAQQGR" "AQQGRLKEDEARR" "VYHRDLKPENLIL" "DANGVLKVSDFGL"
#> [17] "PEVLSDKGYDGAA" "NLMTLYKRICKAE" "LYKRICKAEFSCP" "WFSQGAKRVIKRI"
#> [21] "GAKRVIKRILEPN" "LEDEWFKKGYKPP" "EDEWFKKGYKPPS" "WFKKGYKPPSFDQ"
#> [25] "AAFSNSKECLVTE" "ECLVTEKKEKPVS" "CLVTEKKEKPVSM" "VTEKKEKPVSMNA"
#> [29] "LENLFEKQAQLVK" "KQAQLVKKETRFT" "QAQLVKKETRFTS" "ASEIMSKMEETAK"
#> [33] "KMEETAKPLGFNV" "LGFNVRKDNYKIK" "VRKDNYKIKMKGD" "KDNYKIKMKGDKS"
#> [37] "NYKIKMKGDKSGR" "IKMKGDKSGRKGQ" "GDKSGRKGQLSVA" "HVVELRKTGGDTL"
#> [41] "DTLEFHKVCDSFY" "VCDSFYKNFSSGL" "NFSSGLKDVVWNT"

[http://stackoverflow.com/questions/15954171][]

Using rex may make this type of task a little simpler.

x <- c("System configuration: lcpu=96 mem=196608MB ent=16.00")

library(rex)
val <- as.numeric(
  re_matches(x,
    rex("ent=",
      capture(name = "ent", some_of(digit, "."))
      )
    )$ent
  )

[http://stackoverflow.com/questions/27273996][]

Using rex to construct the regular expression may make it more understandable.

x <- c("_A00_A1234B_", "_A00_A12345B_", "_A1_A12345_")

Approach #1 assumes the value always lies between the second and third underscores.

re_matches(x,
  rex(
    "_",
    anything,
    "_",
    capture(anything),
    "_"
  )
)

#>         1
#> 1  A1234B
#> 2 A12345B
#> 3  A12345

Approach #2 assumes the value is a letter followed by 4 or 5 digits, with an optional trailing letter.

re_matches(x,
  rex(
    capture(
      alpha,
      between(digit, 4, 5),
      maybe(alpha)
    )
  )
)

#>         1
#> 1  A1234B
#> 2 A12345B
#> 3  A12345

[http://stackoverflow.com/questions/27238323][]

Using rex may make this type of task a little simpler.

string <- c("ABC3JFD456", "ARST4DS324")

# The pattern is unanchored, so the leftmost digit of each string is captured.
re_matches(string,
  rex(
    capture(name = "first_number", digit)
    )
  )

#>   first_number
#> 1            3
#> 2            4

http://stackoverflow.com/questions/27252250

Using rex may make this type of task a little simpler.

df <- structure(list(Object = c("T00055", "T00055", "E00336", "E00336",
"E00336", "E00336", "T 00054"), Coding = c("T 00055_005_<002_+",
"T 00055_008_<002_+", "E 00336_041_<001_+001_+", "E 00336_041_<001_+001_+001_+",
"E 00336_041_<001_+001_+002_+", "E 00336_041_<001_+001_+002_<",
"T 00054_013_<003_<015_+003_<001_<"), Fn = c(2L, 2L, 3L, 4L,
4L, 4L, 4L), Remaining = c(30L, 30L, 0L, 10L, 56L, 52L, 52L)), .Names = c("Object",
"Coding", "Fn", "Remaining"), row.names = c(NA, -7L), class = "data.frame")

# Keep the rows whose Coding contains "_+" at least twice. rex generates
# Perl-style patterns, so grepl() is asked to use the PCRE engine explicitly.
subset(df,
  grepl(rex(at_least(group("_+", anything), 2)), Coding, perl = TRUE))

#>   Object                       Coding Fn Remaining
#> 3 E00336      E 00336_041_<001_+001_+  3         0
#> 4 E00336 E 00336_041_<001_+001_+001_+  4        10
#> 5 E00336 E 00336_041_<001_+001_+002_+  4        56
#> 6 E00336 E 00336_041_<001_+001_+002_<  4        52

http://stackoverflow.com/questions/27195734

Using rex may make this type of task a little simpler.

ids <- c("367025001", "CT_341796001", "M13X01692-01", "13C025050901", "13C00699551")

re_substitutes(ids,
  rex(non_digits %or% list("01", end)),
  "",
  global = TRUE)

#> [1] "3670250"    "3417960"    "1301692"    "130250509"  "1300699551"

http://stackoverflow.com/questions/27237233

Using rex may make this type of task a little simpler.

library("rvest")
library("stringr")

# read_html() is the replacement for rvest's deprecated html().
minimal <- read_html("<!doctype html><title>blah</title> <p>&nbsp;foo")

bodytext <- minimal %>%
  html_node("body") %>%
  html_text()

# NOTE(review): the character left in front of "foo" in the output below is
# presumably the non-breaking space from &nbsp;, which `spaces` does not
# match - confirm.
re_substitutes(bodytext, rex(spaces), "", global = TRUE)

#> [1] " foo"

http://stackoverflow.com/questions/27227229

string <- "this\\(system) {is} [full]."
library(Hmisc)
gsub("\\\\(.)", "\\1", escapeRegex(string))

#> [1] "this\\(system) {is} [full]."

Alternatively rex may make this type of task a little simpler.

library(rex)
re_substitutes(escape(string), rex("\\", capture(any)), "\\1", global = TRUE)

#> <SQL> 'this(system) {is} [full].'

http://stackoverflow.com/questions/27317497

rex has a vignette for parsing server logs. While the format is not exactly the same as your log, you should be able to adapt it to your case fairly easily. As for reading the log in, assuming the file fits in memory, your best bet is to read the whole file first with readLines(); the following will then put each field into a data.frame column.

x <- "Feb  6 12:14:14 localhost haproxy[14389]: 10.0.1.2:33317 [06/Feb/2009:12:14:14.655] http-in static/srv1 10/0/30/69/109 200 2750 - - ---- 1/1/1/1/0 0/0 {1wt.eu} {} \"GET /index.html HTTP/1.1\""
library(rex)
# Field-by-field pattern for one haproxy HTTP log line. The pattern is
# unanchored: the leading syslog date and host name are skipped and matching
# starts at the process name.
re <- rex(

  # `alphas` (one or more letters) captures the whole process name; a single
  # `alpha` would only capture the last letter before the "[".
  capture(name = "process_name", alphas),
  "[",
    capture(name = "pid", digits),
  "]:",
  spaces,
  capture(name = "client_ip", any_of(digit, ".")),
  ":",
  capture(name = "client_port", digits),
  spaces,
  "[",
    capture(name = "accept_date", except_some_of("]")),
  "]",
  spaces,
  capture(name = "frontend_name", non_spaces),
  spaces,
  capture(name = "backend_name", except_some_of("/")),
  "/",
  capture(name = "server_name", non_spaces),
  spaces,
  capture(name = "Tq", some_of("-", digit)),
  "/",
  capture(name = "Tw", some_of("-", digit)),
  "/",
  capture(name = "Tc", some_of("-", digit)),
  "/",
  capture(name = "Tr", some_of("-", digit)),
  "/",
  capture(name = "Tt", some_of("+", digit)),
  spaces,
  capture(name = "status_code", digits),
  spaces,
  capture(name = "bytes_read", some_of("+", digit)),
  spaces,
  capture(name = "captured_request_cookie", non_spaces),
  spaces,
  capture(name = "captured_response_cookie", non_spaces),
  spaces,
  capture(name = "termination_state", non_spaces),
  spaces,
  capture(name = "actconn", digits),
  "/",
  capture(name = "feconn", digits),
  "/",
  capture(name = "beconn", digits),
  "/",
  capture(name = "srv_conn", digits),
  "/",
  capture(name = "retries", some_of("+", digit)),
  spaces,
  capture(name = "srv_queue", digits),
  "/",
  capture(name = "backend_queue", digits),
  spaces,
  "{",
    capture(name = "captured_request_headers", except_any_of("}")),
  "}",
  spaces,
  "{",
    capture(name = "captured_response_headers", except_any_of("}")),
  "}",
  spaces,
  double_quote,
    capture(name = "http_request", non_quotes),
  double_quote)

re_matches(x, re)

#>   process_name   pid client_ip client_port              accept_date
#> 1      haproxy 14389  10.0.1.2       33317 06/Feb/2009:12:14:14.655
#>   frontend_name backend_name server_name Tq Tw Tc Tr  Tt status_code
#> 1       http-in       static        srv1 10  0 30 69 109         200
#>   bytes_read captured_request_cookie captured_response_cookie
#> 1       2750                       -                        -
#>   termination_state actconn feconn beconn srv_conn retries srv_queue
#> 1              ----       1      1      1        1       0         0
#>   backend_queue captured_request_headers captured_response_headers
#> 1             0                   1wt.eu                          
#>               http_request
#> 1 GET /index.html HTTP/1.1

http://stackoverflow.com/questions/27422350/extract-character-preceding-first-dot-in-a-string

Using rex may make this type of task a little simpler.

my.data <- read.table(text = '
     my.string  state
     .........    A
     1........    B
     112......    C
     11111....    D
     1111113..    E
     111111111    F
     111111111    G
', header = TRUE, stringsAsFactors = FALSE)

library(rex)

re_matches(my.data$my.string,
  rex(capture(except(".")), "."))$'1'

#> [1] NA  "1" "2" "1" "3" NA  NA

http://stackoverflow.com/questions/27410736

Using rex may make this type of task a little simpler.

string <- "Shakira - Wolf - 02.Hips don't lie.mp3"

library(rex)
re_matches(string,
  rex(capture(zero_or_more(any, type='lazy')), spaces, "-"))$'1'

#> [1] "Shakira"

http://stackoverflow.com/questions/27400286/

Using rex may make this type of task a little simpler.

string <- "I t is tim e to g o" 
library(rex)
re_substitutes(string, rex(
    space %if_next_is%
      list(
        list(non_space, space, at_least(non_space, 2)) %or%
        list(non_space, end)
      )
    ), "", global = TRUE)

#> [1] "It is time to go"

http://stackoverflow.com/questions/27553126

Using rex may make this type of task a little simpler.

string <- "01:04:43.064 [12439] <2> xyz
01:04:43.067 [12439] <2> a lmn
01:04:43.068 [12439] <4> j klm
x_times_wait to <3000>
01:04:43.068 [12439] <4> j klm
enter_object <5000> main k"

library(rex)

timestamp <- rex(n(digit, 2), ":", n(digit, 2), ":", n(digit, 2), ".", n(digit, 3))

re <- rex(timestamp, space,
          "[", digits, "]", space,
          "<", digits, ">", space,
          capture(anything))

re_matches(string, re, global = TRUE)

#> [[1]]
#>       1
#> 1   xyz
#> 2 a lmn
#> 3 j klm
#> 4 j klm